From 372bf0890b19cc3c2992ce5c16eca3647e2a9e13 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 15:25:30 +0800 Subject: [PATCH 01/69] [Bugfix] Fix missing seq_start_loc in xformers prefill metadata (#12464) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/backends/xformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c25dda7aad2c..49f47f9c8ded3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, From 624a1e4711cb9cfdd7e336980668e64744a84863 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 01:09:27 -0800 Subject: [PATCH 02/69] [V1][Minor] Minor optimizations for update_from_output (#12454) Signed-off-by: Woosuk Kwon --- vllm/v1/core/scheduler.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 8ded5e5787133..de7fb1a698df6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] From ce69f7f7542bdb8b6e6302d112fb9fad212c1460 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 18:31:49 +0800 Subject: [PATCH 03/69] [Bugfix] Fix gpt2 GGUF inference (#12467) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/gpt2.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46d..2f1aa2d68653c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." + name if is_pp_missing_parameter(name, self): From 103bd17ac585b44372a47f365d80f13446cf362d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 27 Jan 2025 10:40:00 -0500 Subject: [PATCH 04/69] [Build] Only build 9.0a for scaled_mm and sparse kernels (#12339) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 8 ++++---- cmake/utils.cmake | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ead539993d98c..4dee9ec36895f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs( @@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). + # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..1c1c539819d05 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). 
+ foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) From 01ba927040d0b6f7d8daf6bfbf32fde562d2f8a6 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 27 Jan 2025 17:26:28 +0000 Subject: [PATCH 05/69] [V1][Metrics] Add initial Prometheus logger (#12416) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 41 ++++++++++++++++++++---- vllm/v1/engine/async_llm.py | 11 ++++--- vllm/v1/metrics/loggers.py | 36 +++++++++++++++++++++ 3 files changed, 78 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c6..469a5fb039fb6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... + # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,9 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6dc68b3a16099..917d52d3220b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,7 +24,8 @@ from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -46,13 +47,15 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(labels=dict( + model_name=self.model_config.served_model_name)), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -272,7 +275,7 @@ async def _run_output_handler(self): # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. 
assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..b84f03fa3267c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,8 @@ import time from abc import ABC, abstractmethod +from typing import Dict + +import prometheus_client from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, labels: Dict[str, str]): + self.labels = labels + + labelnames = self.labels.keys() + labelvalues = self.labels.values() + + self._unregister_vllm_metrics() + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) From 3f1fc7425a7db4d9722941075e43bb2ebfb90613 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 09:40:04 -0800 Subject: [PATCH 06/69] [V1][CI/Test] Do basic test for top-p & top-k sampling (#12469) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac604..033bbcfce564e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. 
+ request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() From 2bc3fbba0cf5b07fabb798d41b153b895d30c7b4 Mon Sep 17 00:00:00 2001 From: Bowen Wang Date: Tue, 28 Jan 2025 02:19:24 +0800 Subject: [PATCH 07/69] [FlashInfer] Upgrade to 0.2.0 (#11194) Signed-off-by: Bowen Wang Signed-off-by: youkaichao Co-authored-by: youkaichao --- .buildkite/test-pipeline.yaml | 11 +- Dockerfile | 23 ++- .../test_basic_correctness.py | 5 +- tests/compile/test_basic_correctness.py | 2 +- tests/kernels/test_flashinfer.py | 74 +++---- vllm/attention/backends/flashinfer.py | 183 ++++++++++++++++-- vllm/config.py | 10 +- vllm/model_executor/model_loader/loader.py | 4 +- .../model_executor/model_loader/tensorizer.py | 3 +- vllm/worker/worker_base.py | 17 +- 10 files changed, 257 insertions(+), 75 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index daec46760117d..d5d02fdeb7f4b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -183,7 +183,16 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/engine + - VLLM_USE_V1=1 pytest -v -s v1/sample + - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py + - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - VLLM_USE_V1=1 pytest -v -s v1/e2e - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" diff --git a/Dockerfile b/Dockerfile index cb9cf0da5be65..0b9f74e08dc68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +# How to build this FlashInfer wheel: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose + RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. 
Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements-build.txt requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e026..23285040642a8 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb4..1945479fc3031 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f71665737..1645ef911d697 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer 
= torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 3135b0b405343..7cccef9608218 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -30,7 +33,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. 
+ """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. + """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." + + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -108,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], @@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will be set to the full length of + # the sequence. Defaults to `-1`. 
+ window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. + logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. + sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -363,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -386,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -397,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. 
q_data_type=self.q_data_type) @@ -496,6 +605,11 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.sliding_window = input_builder.sliding_window self.block_size = input_builder.block_size + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] @@ -528,6 +642,20 @@ def prepare(self): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -756,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -885,25 +1017,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if prefill_output is None and decode_output is not None: # Decode only batch. 
diff --git a/vllm/config.py b/vllm/config.py index 7a58d64bcc6e2..dc1d611115489 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -310,14 +310,15 @@ def __init__( (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -3310,7 +3311,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. @@ -3330,7 +3331,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e9779878710ee..527b4307f3670 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 5b4757072353f..e359aef9dcb7f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? 
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c6e6693c54f57..6eeb4aa17051f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -8,7 +8,8 @@ import torch import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -498,8 +499,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. """ kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) from vllm.plugins import load_general_plugins @@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: From 6116ca8cd79b642c64f4ae6f050a6bc12b96d037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 27 Jan 2025 22:38:35 +0100 Subject: [PATCH 08/69] [Feature] [Spec decode]: Enable MLPSpeculator/Medusa and `prompt_logprobs` with ChunkedPrefill (#10132) Signed-off-by: NickLucche Signed-off-by: wallashss Co-authored-by: wallashss --- tests/spec_decode/e2e/conftest.py | 19 +- .../e2e/test_integration_dist_tp2.py | 10 +- tests/spec_decode/e2e/test_logprobs.py | 16 +- .../e2e/test_medusa_correctness.py | 31 ++- tests/spec_decode/e2e/test_mlp_correctness.py | 53 ++++- .../e2e/test_multistep_correctness.py | 31 +-- .../spec_decode/e2e/test_ngram_correctness.py | 13 +- tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/utils.py | 12 + vllm/config.py | 9 +- vllm/engine/llm_engine.py | 19 +- vllm/spec_decode/batch_expansion.py | 133 +++++++---- vllm/spec_decode/interfaces.py | 8 +- vllm/spec_decode/mqa_scorer.py | 68 +++++- vllm/spec_decode/spec_decode_worker.py | 211 +++++++++++++----- 16 files changed, 469 insertions(+), 166 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..5cb982a0811c7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -2,6 +2,7 @@ from typing import 
List, Optional, Sequence, Tuple, Union import pytest +import torch from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled( spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) assert spec_pos_logprob.rank == -1 assert spec_pos_logprob.logprob == 0.0 + if isinstance(spec_pos_logprob_token_id, torch.Tensor): + spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() assert spec_pos_logprob_token_id in baseline_pos_logprobs @@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model, batch_size: int, max_output_len: int, seed: int = 0, - temperature: float = 0.0): + temperature: float = 0.0, + logprobs: Optional[int] = None): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model, results = [] prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): with RemoteOpenAIServer(model, args, @@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model, prompt=prompts, max_tokens=max_output_len, seed=seed, - temperature=temperature) + temperature=temperature, + logprobs=logprobs) results.append({ "test": "seeded_sampling", "text": [choice.text for choice in completion.choices], + "logprobs": [choice.logprobs for choice in completion.choices], "finish_reason": [choice.finish_reason for choice in completion.choices], "usage": @@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model, n = len(results) // 2 arg1_results = results[:n] arg2_results = results[n:] + # Separate logprobs to avoid asserting exact equality. + arg1_logprobs = [r.pop("logprobs") for r in arg1_results] + arg2_logprobs = [r.pop("logprobs") for r in arg2_results] + for arg1_result, arg2_result in zip(arg1_results, arg2_results): assert arg1_result == arg2_result, ( f"Results for {model=} are not the same with {arg1=} and {arg2=}. " f"{arg1_result=} != {arg2_result=}") + if logprobs: + for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): + for l1, l2 in zip(logs1, logs2): + assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 02cba92795142..7001ee4c007fe 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -2,6 +2,8 @@ tensor parallelism. """ +from typing import Optional + import pytest import torch @@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, "--speculative-draft-tensor-parallel-size", "1", ])]) +@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, + logprobs: Optional[int], batch_size: int, seed: int): """Verify spec decode works well with same and different TP size for the draft model with chunked prefill. 
""" + if logprobs: + test_llm_kwargs.extend( + ["--disable_logprobs_during_spec_decoding", "False"]) run_equality_correctness_test_tp(model, common_llm_kwargs, per_test_common_llm_kwargs, @@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, batch_size, max_output_len=32, seed=seed, - temperature=0.0) + temperature=0.0, + logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..1a543606cb3f3 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,26 +4,27 @@ from vllm import SamplingParams +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-68m", + "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True, + "enforce_eager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": True, }]) @@ -36,12 +37,15 @@ ]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) def test_logprobs_equality(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify output logprobs are equal with and without speculative decoding. + seed: int, logprobs: int, prefill_chunk_size: int): + """Verify output logprobs are equal with and without speculative decoding, + as well as with and without chunked prefill. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index b8965606b3d0e..dbcbc0db10881 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -21,6 +21,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -67,12 +68,14 @@ ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): + seed: int, logprobs: int, + prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_cuda_graph( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with cuda graph enabled and different batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality, even when some sequences are preempted mid- generation. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, + prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, prefill_chunk_size: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 183ff2f5db274..1fa1104f5d3a8 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -66,14 +67,16 @@ @pytest.mark.parametrize("output_len", [ 128, ]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("batch_size", [4, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, seed: int, - logprobs: int): + logprobs: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # NOTE Test is sensitive enough st if we don't enable chunked prefill + # scheduling on baseline too, we get slightly different logprobs, ending + # up sampling different tokens at the tail (ie top tokens don't change). + # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? 
+ maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): + batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): """Verify acceptance rate with different batch size and large output length.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("temperature", [1.0]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - temperature: float, seed: int): + temperature: float, + prefill_chunk_size: int, seed: int): """Verify seeded runs produce the same output.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, 128, ]) @pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality, even when some sequences are preempted mid- generation. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_correctness_with_padding( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality when the vocab dimension is padded """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) # Default pad_to is 64, test model has vocab_size of 32000 def patched_pad_vocab_size(vocab_size, pad_to=None): @@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None): # Use smaller output len for fast test. 
32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, - output_len: int): + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +# Speculative decoding is disabled when sequences reach decoding and the batch +# consists of single-token requests. Hence we set `max_num_seqs` +# >= `speculative_disable_by_batch_size` to test feature interaction. +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, prefill_chunk_size: int, seed: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index a13cca41f99e5..05ad468dd8bc5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": False, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + "disable_logprobs_during_spec_decoding": False + }, { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "disable_logprobs_during_spec_decoding": False + }]) @pytest.mark.parametrize( "output_len", [ @@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( batch_size, max_output_len=output_len, seed=seed, + prompt_logprobs=2, + logprobs=2, + disable_logprobs=False, temperature=0.0, ensure_all_accepted=ensure_all_accepted) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index e53d169a8fcc3..77f8b8998c8d3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,6 +26,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @@ -49,11 +50,13 @@ "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": False, }, { "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": True, }, ]) @pytest.mark.parametrize("output_len", [ @@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, batch_size: int, output_len: int, prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" - if prefill_chunk_size > 0: - common_llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - common_llm_kwargs["enable_chunked_prefill"] = False + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 0b1509d8b7785..5a093dea16d40 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, num_gpu_blocks = 2048 // block_size scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) + scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer 
scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True scorer_worker.model_runner.model.sampler.\ should_modify_greedy_probs_inplace = True diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index caf7a7e625b46..d8c3af4c1cd1e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens(): seq_group_metadata_list=seq_group_metadata_list, accepted_token_ids=accepted_token_ids, target_logprobs=target_token_logprobs, + prompt_logprobs=None, k=k, stage_times=(0, 0, 0)) # Verify that _seq_with_bonus_token_in_last_step contains the following: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4bfa6b2f384b..2f883c2ff9b7a 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -274,3 +274,15 @@ def create_batch(batch_size, prompts, num_gpu_blocks, block_size, final_prompt_lens, prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens + + +def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): + if prefill_chunk_size > 0: + llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + llm_kwargs["enable_chunked_prefill"] = False diff --git a/vllm/config.py b/vllm/config.py index dc1d611115489..7ab632d7e3667 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1685,7 +1685,8 @@ def maybe_create_spec_config( raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1752,12 +1753,6 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") - if enable_chunked_prefill and draft_hf_config.model_type in ( - "medusa", "mlp_speculator", "eagle"): - raise ValueError( - "Chunked prefill and hidden-state based draft models are " - "not compatible.") - speculative_draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( target_parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da18d5f7d2eb..ab67ae29723cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1010,8 +1010,23 @@ def _process_model_outputs(self, self.speculative_config # Organize outputs by [step][sequence group] instead of # [sequence group][step]. - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, num_seq_groups=len(seq_group_metadata_list)) + if self.scheduler_config.is_multi_step: + outputs_by_sequence_group = create_output_by_sequence_group( + outputs, len(seq_group_metadata_list)) + elif self.speculative_config: + # Decodes are multi-steps while prefills are not, outputting at + # most 1 token. Separate them so that we can trigger chunk + # processing without having to pad or copy over prompts K times + # to match decodes structure (costly with prompt_logprobs). 
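# [Editor's note - illustrative sketch, not part of the patch] Toy version of
# the regrouping described in the comment above and implemented in the lines
# that follow, using plain lists in place of SamplerOutput objects. The
# `regroup` helper and the example values below are hypothetical.

def regroup(outputs, num_prefills, num_decode_groups):
    prefills, decode_steps = outputs[:num_prefills], outputs[num_prefills:]
    # decode_steps[step][group] -> one list per decode group, ordered by step.
    decodes_by_group = [[step[g] for step in decode_steps]
                        for g in range(num_decode_groups)]
    # Each prefill already holds the single output of one sequence group.
    return [[p] for p in prefills] + decodes_by_group

# 2 prefills, 2 decode groups, 3 speculative steps:
outs = ["p0", "p1",
        ["d0_s0", "d1_s0"], ["d0_s1", "d1_s1"], ["d0_s2", "d1_s2"]]
assert regroup(outs, 2, 2) == [["p0"], ["p1"],
                               ["d0_s0", "d0_s1", "d0_s2"],
                               ["d1_s0", "d1_s1", "d1_s2"]]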
+ num_prefills = sum(sg.is_prompt + for sg in seq_group_metadata_list) + prefills, decodes = outputs[:num_prefills], outputs[ + num_prefills:] + outputs_by_sequence_group = create_output_by_sequence_group( + decodes, + num_seq_groups=len(seq_group_metadata_list) - num_prefills) + outputs_by_sequence_group = [p.outputs for p in prefills + ] + outputs_by_sequence_group # We have outputs for multiple steps submitted in a single burst, # so invalidate is_first_step_output. is_first_step_output = None diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 01b9cdad963da..56fb9ba506a44 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,13 +83,13 @@ def score_proposals( if not non_spec_indices: # All sequence groups in batch have spec decoding enabled - contracted = self._contract_batch_all_spec( + return self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, ) else: # Batch has a mix of spec decode enabled and disabled seq groups - contracted = self._contract_batch( + return self._contract_batch( execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, @@ -99,14 +99,6 @@ def score_proposals( k=execute_model_req.num_lookahead_slots, ) - all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted - return SpeculativeScores( - probs=all_probs, - token_ids=all_tokens, - logprobs=spec_logprobs, - hidden_states=all_hidden_states, - ) - def _expand_batch( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -143,13 +135,57 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) + def _contract_non_speculative( + self, scores: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + has_prompt_log: bool) -> SpeculativeScores: + """ + Augment input `scores` with non-speculative requests outputs. + This includes decode requests with speculation turned off, as well + as prefill requests when `enable_chunked_prefill` is set. + For the latter, prefills are further separated into terminal and + non-terminal chunks (from which no token is sampled). + """ + if not non_spec_indices: + return scores + + if has_prompt_log: + # When prompt_logprobs is enabled, prefills yield output token + # (and respective prob) in the last entry (prompt|out): + # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. + # With chunked prefill, non-terminal chunks have -1 on each + # position: they're still picked, but they're discarded later. + seq_meta = seq_group_metadata_list + nospec_sizes = torch.tensor([ + seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 + for i in non_spec_indices + ]) + nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) + else: + # In this case only sampled tokens are returned, select all. 
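# [Editor's note - illustrative sketch, not part of the patch] How the
# cumulative-sum indexing in the `has_prompt_log` branch just above locates
# each non-speculative request's sampled token. Hypothetical batch: two
# prefill chunks of 4 and 2 prompt tokens plus one decode, so the target
# output is laid out as [.|.|.|out0, .|out1, out2]:
import torch

nospec_sizes = torch.tensor([4, 2, 1])
nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
assert nospec_sampled_token_idxs.tolist() == [3, 5, 6]
# The else-branch continuing below handles the case without prompt logprobs,
# where only sampled tokens are returned and every position is selected.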
+ nospec_sampled_token_idxs = list( + range(len(non_spec_outputs.token_ids))) + + scores.token_ids[non_spec_indices, :1] = \ + non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) + scores.probs[non_spec_indices, :1, :] = \ + non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) + scores.logprobs[non_spec_indices, :1, :] = \ + non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) + if scores.hidden_states is not None: + assert non_spec_outputs.hidden_states is not None + scores.hidden_states[non_spec_indices, :1, :] = \ + non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) + return scores + def _contract_batch( - self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - num_scoring_tokens: int, non_spec_indices: List[int], - spec_indices: List[int], k: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + self, + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, num_scoring_tokens: int, + non_spec_indices: List[int], spec_indices: List[int], + k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -195,23 +231,28 @@ def _contract_batch( else: all_hidden_states = None - # Rule out prefills that produce no tokens. - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - if len(non_spec_indices): - all_tokens[non_spec_indices, :1] = \ - non_spec_target_token_ids.unsqueeze(1) - all_probs[non_spec_indices, :1, :] = \ - non_spec_target_probs.unsqueeze(1) - all_logprobs[non_spec_indices, :1, :] = \ - non_spec_target_logprobs.unsqueeze(1) - if all_hidden_states is not None: - assert non_spec_target_hidden_states is not None - all_hidden_states[non_spec_indices, :1, :] = \ - non_spec_target_hidden_states.unsqueeze(1) - + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in contracted_seq_group_metadata_list) + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is. + prompt_logprobs = None + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + elif not has_prompt_log: + # When prompt logprobs are not to be returned, + # we can ignore non-terminal chunks (no out token). + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + + # "Contract" speculative. 
if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs @@ -219,14 +260,27 @@ def _contract_batch( if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states - return all_tokens, all_probs, all_logprobs, all_hidden_states + spec_scores = SpeculativeScores(probs=all_probs, + token_ids=all_tokens, + logprobs=all_logprobs, + hidden_states=all_hidden_states, + prompt_logprobs=prompt_logprobs) + + non_spec_outputs = SpeculativeScores( + probs=non_spec_target_probs, + token_ids=non_spec_target_token_ids, + logprobs=non_spec_target_logprobs, + hidden_states=non_spec_target_hidden_states) + # Contract remaining nonspec entries based on non_spec_indices, if any. + return self._contract_non_speculative( + spec_scores, contracted_seq_group_metadata_list, non_spec_indices, + non_spec_outputs, has_prompt_log) def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + ) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -250,8 +304,11 @@ def _contract_batch_all_spec( target_hidden_states = target_hidden_states.reshape( *target_token_ids.shape, target_hidden_states.shape[-1]) - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states) + return SpeculativeScores(probs=target_probs, + token_ids=target_token_ids, + logprobs=target_logprobs, + hidden_states=target_hidden_states, + prompt_logprobs=None) def _create_scoring_model_input( self, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index a4fe0f13c8db1..c39e98b6cca12 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set, Union +from typing import List, Optional, Set, Union import torch -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PromptLogprobs from vllm.worker.worker_base import WorkerBase @@ -54,6 +54,10 @@ class SpeculativeScores: # Optional last hidden states from the scoring model. hidden_states: Optional[torch.Tensor] = None + # Scoring model may also return logprobs for prompt tokens + # for each request, when chunked prefill is enabled. + prompt_logprobs: Optional[List[PromptLogprobs]] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index cbf793e2043e3..3aea2eabb4144 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -72,9 +72,15 @@ def score_proposals( target_token_ids = target_sampler_output.sampled_token_ids target_probs = target_sampler_output.sampled_token_probs target_logprobs = target_sampler_output.logprobs + prompt_logprobs = None + # If all requests have the same number of query tokens, we can avoid # the for loop to build output for better performance. if min(all_proposal_lengths) == k: + # Regular decodes only. 
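# [Editor's note - illustrative sketch, not part of the patch] The fast path
# guarded just above (`min(all_proposal_lengths) == k`) can reshape the flat
# sampler output directly, since with no prefills every request contributes
# exactly k + 1 scored positions. Sizes here are hypothetical:
import torch

bs, k, vocab = 2, 3, 11
target_token_ids = torch.arange(bs * (k + 1))
target_probs = torch.rand(bs * (k + 1), vocab)
assert target_token_ids.reshape(bs, k + 1).shape == (bs, k + 1)
assert target_probs.reshape(bs, k + 1, vocab).shape == (bs, k + 1, vocab)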
+ assert all(not sg.is_prompt + for sg in target_seq_group_metadata_list + if sg.is_prompt) bs, _ = proposals.proposal_token_ids.shape all_tokens = target_token_ids.reshape(bs, k + 1) all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) @@ -88,19 +94,56 @@ def score_proposals( all_logprobs = target_logprobs.new_full(size=all_probs.shape, fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() - start_loc = 0 - for i, (proposed_len, seq_meta) in enumerate( - zip(all_proposal_lengths, target_seq_group_metadata_list)): + + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is, since it may be + # that n_prompts >> K. + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in target_seq_group_metadata_list) + # TODO (NickLucche) we should surface `disable_logprobs` as to not + # break abstraction to get its value. + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + + # Split loop into prefill|decode for readability. + start_loc, i = 0, 0 + while i < len(target_seq_group_metadata_list + ) and target_seq_group_metadata_list[i].is_prompt: + seq_meta = target_seq_group_metadata_list[i] + end_loc = start_loc + if has_prompt_log: + end_loc += seq_meta.token_chunk_size + elif seq_meta.do_sample: + end_loc += 1 + # Skip chunks with no output tokens. if seq_meta.do_sample: - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + # Get sampled token (last position in chunk) and its prob. + all_tokens[i, 0] = target_token_ids[end_loc - 1] + all_probs[i, 0] = target_probs[end_loc - 1] + all_logprobs[i, 0] = target_logprobs[end_loc - 1] + + i += 1 + start_loc = end_loc + # Decodes. 
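# [Editor's note - illustrative sketch, not part of the patch] Offset walking
# performed by the prefill loop above and the decode loop that follows, with
# prompt logprobs enabled and hypothetical sizes: three prefill chunks of
# 3, 2 and 4 prompt tokens, then two decodes with proposal length k = 2. Each
# prefill spans its whole chunk (a terminal chunk's sampled token is read at
# end_loc - 1); each decode spans proposed_len + 1 scored positions.
chunk_sizes = [3, 2, 4]
k = 2
spans, start_loc = [], 0
for size in chunk_sizes:                 # prefill chunks
    end_loc = start_loc + size
    spans.append((start_loc, end_loc))
    start_loc = end_loc
for _ in range(2):                       # decodes
    end_loc = start_loc + (k + 1)
    spans.append((start_loc, end_loc))
    start_loc = end_loc
assert spans == [(0, 3), (3, 5), (5, 9), (9, 12), (12, 15)]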
+ while i < len(target_seq_group_metadata_list): + proposed_len, seq_meta = all_proposal_lengths[ + i], target_seq_group_metadata_list[i] + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc + i += 1 hidden_states = None if target_sampler_output.hidden_states is not None: @@ -110,4 +153,5 @@ def score_proposals( return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, - hidden_states=hidden_states) + hidden_states=hidden_states, + prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0d66ede3d907a..8e9802c7d333c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -563,50 +563,57 @@ def _serialize_sampler_output_no_logprobs( (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - if sg.do_sample # ignore empty token sequences ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with # their own empty output. - for seq_group_meta in execute_model_req.seq_group_metadata_list: - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput(samples=[], - prompt_logprobs=None)) - else: - # Sequence with output. - seq_id, seq_data = seq_data_entries[output_index] - needs_prompt_logprobs = seq_output_prompt_logprobs[ - output_index] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], + for idx, seq_group_meta in enumerate( + execute_model_req.seq_group_metadata_list): + needs_prompt_logprobs = seq_output_prompt_logprobs[idx] + seq_id, seq_data = seq_data_entries[idx] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + + # Some of these sequences may belong to non-terminal chunks, + # which may still have to report logprobs for prompts. + start = 1 if seq_data._num_computed_tokens == 0 \ + else seq_data._num_computed_tokens + end = (seq_data._num_computed_tokens + \ + seq_group_meta.token_chunk_size) + prompt_token_ids = prompt_token_ids[start:end] + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, token_id_logprob_rank=-1, token_id_logprob=0.0, - seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 + ) for p_token_id in prompt_token_ids + ] + else: + prompt_logprobs = None + + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. 
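# [Editor's note - illustrative sketch, not part of the patch] The chunk-wise
# prompt slicing above, for a hypothetical 6-token prompt processed in chunks
# of 3. The first prompt token never gets a logprob, so the first chunk starts
# at 1; later chunks resume at the number of already-computed tokens.
prompt_token_ids = [11, 22, 33, 44, 55, 66]

def chunk_slice(num_computed_tokens, token_chunk_size):
    start = 1 if num_computed_tokens == 0 else num_computed_tokens
    end = num_computed_tokens + token_chunk_size
    return prompt_token_ids[start:end]

assert chunk_slice(0, 3) == [22, 33]        # first (non-terminal) chunk
assert chunk_slice(3, 3) == [44, 55, 66]    # second, terminal chunk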
+ if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput( + samples=[], prompt_logprobs=prompt_logprobs)) + continue + + # Sequence with output. + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], + token_id_logprob_rank=-1, + token_id_logprob=0.0, + seq_id=seq_id, + topk_token_ids=[], + topk_logprobs=[], + prompt_logprobs=prompt_logprobs)) + output_index += 1 return [SamplerOutput(outputs=completion_seq_group_output_list)] @@ -624,24 +631,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] - # Store hidden states from target model execution. + # Store hidden states from target model execution, BxD. hidden_states = sampler_output.hidden_states if hidden_states is not None: - # remove hidden_states for prompt tokens - # TODO Enable `return_hidden_states`: prefill chunks hidden states - # are pruned by the logits processor. Also, they should be arranged - # back into full-prefill latent. Address it to enable MLPSpeculator. - if any(seq.is_prompt - for seq in execute_model_req.seq_group_metadata_list): + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) hidden_states = hidden_states[ torch.where(sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None: + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): self.previous_hidden_states = HiddenStates( - hidden_states, execute_model_req.seq_group_metadata_list) - else: - self.previous_hidden_states.update( - hidden_states, execute_model_req.seq_group_metadata_list) + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) if not skip_proposer: # We prepare the prefill hidden states here so that there no @@ -752,13 +762,13 @@ def _run_speculative_decoding_step( ] if len(non_spec_indices): all_hidden_states = proposal_scores.hidden_states - # TODO fix `return_hidden_states`, same as in `_run_no_spec` if all_hidden_states is not None: prefill_hidden_states = all_hidden_states[non_spec_indices] execute_model_req.previous_hidden_states = \ prepare_prefill_hidden_states(prefill_hidden_states) # Sync proposer KV cache for prefills. prefill_req = execute_model_req.clone(non_spec_seqs) + # TODO avoid sampling here? self.proposer_worker.execute_model(prefill_req) with Timer() as verification_timer: @@ -774,6 +784,8 @@ def _run_speculative_decoding_step( execute_model_req.seq_group_metadata_list, accepted_token_ids, target_logprobs=target_logprobs, + prompt_logprobs=proposal_scores.prompt_logprobs + if not self._disable_logprobs else None, k=execute_model_req.num_lookahead_slots, stage_times=stage_times) @@ -845,19 +857,32 @@ def _verify_tokens( # metadata. 
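# [Editor's note - illustrative sketch, not part of the patch] The hidden-state
# filtering in `_run_no_spec` above keeps only rows whose sampled token id is
# valid, dropping non-terminal prefill chunks. Shapes here are hypothetical:
import torch

INVALID_TOKEN_ID = -1                    # stands in for VLLM_INVALID_TOKEN_ID
hidden_states = torch.randn(3, 8)        # one row per sequence, hidden size 8
sampled_token_ids = torch.tensor([[12], [INVALID_TOKEN_ID], [7]])
keep = torch.where(sampled_token_ids - INVALID_TOKEN_ID)[0]
assert keep.tolist() == [0, 2]
assert hidden_states[keep].shape == (2, 8)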
accepted_token_ids[original_indices] = accepted_token_ids.clone() + # B x K+1 x D hidden_states = proposal_scores.hidden_states if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + # Contract hidden states based on accepted tokens hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) - index = accepted_index[:, None, None].expand(-1, 1, hs_size) + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + hidden_states = hidden_states[ + accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[ + accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( + terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, + hs_size) # b x 1 x d second_last_token_hidden_states = hidden_states[:, -2] # b x d hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_metadata_list, + hidden_states, terminal_metadata, second_last_token_hidden_states) return accepted_token_ids, logprobs @@ -866,6 +891,8 @@ def _create_output_sampler_list( seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] + prompt_logprobs: Optional[ + torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, stage_times: Tuple[float, float, float], ) -> List[SamplerOutput]: @@ -909,15 +936,89 @@ def _create_output_sampler_list( # Construct the output on a per-step, per-sequence basis. # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while + # terminal chunks will only have one generated token at time 0. sampler_output_list: List[SamplerOutput] = [] + + # Prefills are not multi-step (return at most 1 token), in order to + # avoid padding or repetition to fit decodes, we separate them. + for i, sg in enumerate(seq_group_metadata_list): + if not sg.is_prompt: + # Requests are ordered as prefills|decodes=>no more prefills. + break + num_logprobs = num_logprobs_per_seq[i] + seq_kwargs = dict(token_id=-1, + token_id_logprob_rank=0, + token_id_logprob=-float('inf'), + topk_token_ids=[-1] * num_logprobs, + topk_logprobs=[-float('inf')] * num_logprobs, + seq_id=seq_ids[i]) + # Terminal chunk, has token. + if sg.do_sample: + seq_kwargs.update( + dict( + token_id=accepted_token_ids[i][0].item(), + token_id_logprob_rank=accepted_token_id_ranks_by_step[ + 0][i], + token_id_logprob=accepted_token_id_logprobs_by_step[0] + [i], + topk_token_ids=topk_indices_by_step[0][i] + [:num_logprobs], + # output only so step is 0 + topk_logprobs=topk_logprobs_by_step[0][i] + [:num_logprobs], + )) + needs_plogs = (sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + plogs = None + if prompt_logprobs is not None: + # Even non-terminal prompt chunks can have logprobs here. + plogs = prompt_logprobs[i] + elif needs_plogs: + # Prompt logprobs are requested but `_disable_logprobs` is set. 
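# [Editor's note - illustrative sketch, not part of the patch] How
# `_verify_tokens` above turns accepted token ids into the index of the last
# accepted position, and why all -1 rows (non-terminal prefill chunks) are
# dropped before gathering hidden states. Hypothetical batch of 3 with
# k + 1 = 4:
import torch

INVALID_TOKEN_ID = -1                    # stands in for VLLM_INVALID_TOKEN_ID
accepted_token_ids = torch.tensor([
    [501, 502, -1, -1],                  # decode: 2 accepted -> index 1
    [777, -1, -1, -1],                   # decode: 1 accepted -> index 0
    [-1, -1, -1, -1],                    # non-terminal prefill chunk -> -1
])
accepted_index = (accepted_token_ids + 1).count_nonzero(dim=1).add_(-1)
assert accepted_index.tolist() == [1, 0, -1]
assert (accepted_index != INVALID_TOKEN_ID).tolist() == [True, True, False]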
+ seq_data = next(iter(sg.seq_data.values())) + # Get only the tokens in this chunk! + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = prompt_token_ids[ + seq_data. + _num_computed_tokens:seq_data._num_computed_tokens + + sg.token_chunk_size] + + is_first_chunk = seq_data._num_computed_tokens == 0 + # There's no prob generated for the first token in a sequence. + if is_first_chunk: + prompt_token_ids = prompt_token_ids[1:] + plogs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) for p_token_id in prompt_token_ids + ] + seq_kwargs.update(dict(prompt_logprobs=plogs)) + + sampler_output_list.append( + SamplerOutput( + outputs=[create_sequence_group_output( + **seq_kwargs)])) # type: ignore + + # Decodes, create one SamplerOutput per-step (at most K+1). for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): + if all(token_id == -1 for sg, token_id in zip( + seq_group_metadata_list, + accepted_token_ids_by_step[step_index]) + if not sg.is_prompt): break step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): + seq_meta = seq_group_metadata_list[sequence_index] + # Prompts already processed above. + if seq_meta.is_prompt: + continue + # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] step_output_token_ids.append( @@ -952,6 +1053,8 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) + # First `n_prefills` entries will contain prefills SamplerOutput when + # chunked prefill is enabled, the rest is decodes in multi-step format. 
return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, From 823ab796330825f4052d771e2c462ad3b55236eb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:23:08 +0000 Subject: [PATCH 09/69] Update `pre-commit` hooks (#12475) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +- benchmarks/benchmark_serving.py | 4 +- csrc/custom_all_reduce.cuh | 8 +- csrc/moe/marlin_kernels/marlin_moe_kernel.h | 8 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 16 +-- .../marlin/dense/marlin_cuda_kernel.cu | 4 +- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 4 +- csrc/quantization/marlin/sparse/common/mma.h | 4 +- csrc/rocm/attention.cu | 4 +- setup.py | 2 +- tests/kernels/test_block_fp8.py | 25 ++-- tests/kv_transfer/test_lookup_buffer.py | 10 +- tests/lora/test_qwen2vl.py | 6 +- .../vision_language/test_models.py | 130 ++++++++++-------- .../vision_language/test_pixtral.py | 17 ++- tests/quantization/test_compressed_tensors.py | 6 +- tests/samplers/test_rejection_sampler.py | 15 +- tools/report_build_time_ninja.py | 5 +- vllm/_custom_ops.py | 4 +- vllm/attention/ops/prefix_prefill.py | 28 ++-- vllm/attention/ops/triton_flash_attention.py | 4 +- vllm/attention/selector.py | 4 +- vllm/config.py | 7 +- vllm/core/block/common.py | 7 +- vllm/core/block_manager.py | 4 +- vllm/core/scheduler.py | 23 ++-- .../device_communicators/shm_broadcast.py | 8 +- vllm/distributed/parallel_state.py | 8 +- vllm/entrypoints/chat_utils.py | 4 +- vllm/entrypoints/openai/serving_completion.py | 9 +- .../granite_20b_fc_tool_parser.py | 4 +- vllm/lora/layers.py | 12 +- vllm/lora/models.py | 5 +- vllm/lora/ops/triton_ops/sgmv_expand.py | 5 +- vllm/lora/ops/triton_ops/sgmv_shrink.py | 4 +- .../kernels/mixed_precision/MPLinearKernel.py | 12 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 14 +- .../layers/quantization/utils/fp8_utils.py | 7 +- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/model_executor/layers/sampler.py | 7 +- .../layers/vocab_parallel_embedding.py | 16 +-- vllm/model_executor/model_loader/loader.py | 5 +- .../model_executor/model_loader/tensorizer.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/model_executor/models/granitemoe.py | 6 +- vllm/model_executor/models/mllama.py | 4 +- vllm/model_executor/models/mlp_speculator.py | 4 +- vllm/model_executor/models/phimoe.py | 8 +- vllm/model_executor/models/registry.py | 3 +- vllm/model_executor/models/ultravox.py | 8 +- vllm/model_executor/models/utils.py | 5 +- vllm/model_executor/sampling_metadata.py | 11 +- vllm/platforms/neuron.py | 4 +- vllm/scalar_type.py | 4 +- vllm/spec_decode/spec_decode_worker.py | 4 +- vllm/spec_decode/top1_proposer.py | 10 +- vllm/spec_decode/util.py | 12 +- vllm/transformers_utils/configs/nemotron.py | 4 +- vllm/utils.py | 10 +- vllm/v1/core/scheduler.py | 4 +- vllm/v1/stats/common.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/hpu_worker.py | 8 +- vllm/worker/tpu_model_runner.py | 4 +- 64 files changed, 322 insertions(+), 288 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432bf5ed18dbc..7b32df90bfd8b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,18 +3,18 @@ default_stages: - manual # Run in CI repos: - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.43.0 hooks: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - 
repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.9.3 hooks: - id: ruff args: [--output-format, github] - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.0 hooks: - id: codespell exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' @@ -23,7 +23,7 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v19.1.7 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' @@ -35,7 +35,7 @@ repos: - id: pymarkdown files: docs/.* - repo: https://github.com/rhysd/actionlint - rev: v1.7.6 + rev: v1.7.7 hooks: - id: actionlint - repo: local diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 63d2c3f7c7dd9..8b3212831e7e0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842fbdf95..7c33fea93d6ae 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef886e4db..4db8f5dcdabf6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444f3a21..048a3f736fb71 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f771c8b..49eee4128ee7c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 9477790629c9f..ffa9d44610a7f 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions){UNREACHABLE_CODE} + const int max_num_partitions) { + UNREACHABLE_CODE +} #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support diff --git a/setup.py b/setup.py index ee193e4693806..59ece870b5585 100755 --- a/setup.py +++ b/setup.py @@ -417,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc4582a180..f28fdf3feedbc 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb8cbbe..4d6890305af73 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" % (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device): rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! 
My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129db5f6a..570aa3861d0be 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 14d9a739be318..d5f0d63288cc1 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -521,12 +521,13 @@ def _mark_splits( # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=False, - )) 
+@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -627,12 +632,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..8103e5305b91b 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, 
filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bf0d454ad511c..1072697ecf5cc 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -314,9 +314,9 @@ def check_model(model): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type." - ) +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc85821..dcb1b27bff37f 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. # This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. 
- sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc74fe1..9dc19f5fd4cdd 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440bc52012ab7..85c1121ed6ff8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -820,8 +820,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." 
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e2f2b66dfc90c..ec3c8459c43ef 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -219,8 +219,8 @@ def _fwd_kernel( float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2( # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2( # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -639,8 +639,8 @@ def _fwd_kernel_alibi( k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ def _fwd_kernel_alibi( v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index f94211116a746..ef04603f22b6e 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eefb5410..1376274d57777 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( head_size: int, dtype: 
torch.dtype, diff --git a/vllm/config.py b/vllm/config.py index 7ab632d7e3667..d7c9311ae3cb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -1976,8 +1977,8 @@ def _verify_args(self) -> None: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932eafb6..115f663e4ad34 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 62a5f0bda061a..2d6a132ed555b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ def can_allocate(self, device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b1630b34947bd..2bb961481e5fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -988,8 +988,8 @@ def _schedule_prefills( waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1096,8 +1096,8 @@ def _schedule_default(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1189,8 +1189,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1358,8 +1358,8 @@ def schedule( # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
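The condition this NOTE refers to (re-wrapped just below) only enables sampling once a chunk reaches the end of the sequence. A toy restatement with made-up numbers:

    # Chunked prefill: sample only when this chunk completes the sequence.
    token_chunk_size, num_computed_tokens, seq_len = 16, 32, 50
    do_sample = not (token_chunk_size + num_computed_tokens < seq_len)
    # False here: 48 < 50, so the prompt is not finished yet.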
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1625,10 +1625,9 @@ def _passed_delay(self, now: float) -> bool: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f62f66..268edc0925fe8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. 
", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ffdf8b0f48087..7fe9b68d4b9e8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1014,8 +1014,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = kv_transfer.KVTransferAgent( rank=get_world_group().rank, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d16ab86..723d6e9085806 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20caf8119..b0179f78bd635 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -522,11 +522,10 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f379e33a..93e357e8b9f21 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ def extract_tool_calls( start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b74b2f..cdd439d0385b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: 
self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b77b6b3d72ff4..2e04cb902d009 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ def __init__( # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b703810b..48fa5cd63741f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe8286f56..9bb35e8ffd323 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9b00d9..915bdc4778929 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( 
getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f191136..c4a83b4faafe6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b6882cc7c837c..43b1997019107 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -72,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67cf1677..9977804188a50 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -73,8 +73,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. 
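A hedged sketch of the sentinel check above (the finfo.min-as-initialization detail is inferred from the surrounding comment, and the helper name here is made up):

    import torch

    FP8_MIN = torch.finfo(torch.float8_e4m3fn).min

    def needs_requantization(weight_scale: torch.Tensor) -> bool:
        # Shard scales are assumed to start at finfo.min; if the last shard's
        # scale rose above that sentinel, per-shard scales were loaded from
        # the checkpoint and the fused layer is requantized to one max scale.
        return bool(weight_scale[-1] > FP8_MIN)

    needs_requantization(torch.tensor([0.02, 0.03, FP8_MIN]))  # False (fused)
    needs_requantization(torch.tensor([0.02, 0.03, 0.04]))     # True  (unfused)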
if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2d12c466ba45..8dc26309d754e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -716,9 +716,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3eb5c39ccf580..f230efacacdbb 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 527b4307f3670..712266ee42639 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1121,8 +1121,9 @@ def _load_weights(self, model_config: ModelConfig, # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight shard_pos = quant_param_name.find(shard_name) - can_correct_rename = (shard_pos > 0) and ( - quant_param_name[shard_pos - 1] == ".") + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") # If the quant_param_name is packed, it won't occur in the # param_dict before renaming. 
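The substring guard above exists because a bare str.find() can hit a longer shard name. A small standalone illustration of the failure mode it prevents:

    quant_param_name = "vpm.encoder.layers.0.self_attn.qkv_proj.weight"
    shard_name = "v_proj"
    shard_pos = quant_param_name.find(shard_name)   # matches inside "qkv_proj"
    can_correct_rename = shard_pos > 0 and quant_param_name[shard_pos - 1] == "."
    # False: the character before the match is "k", so this is not a real
    # "v_proj" shard and no rename should be attempted.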
new_quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index e359aef9dcb7f..9266ca75ddaac 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -298,8 +298,8 @@ def _resize_lora_embeddings(self): to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866bc2f0..b23aba829c549 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b518a0a6cbdee..cdf9414d5949c 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -429,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -442,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 61baa8e588d74..e15ac84a6049b 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1365,8 +1365,8 @@ def forward( # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. 
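A toy trace of the mask computed just above, for the text-only and image-present-decode path:

    import torch

    encoder_seq_lens_tensor = torch.tensor([0, 7, 0, 3])  # made-up per-request lengths
    # Requests with no encoder (image) tokens get their cross-attention rows
    # masked out entirely.
    full_text_row_masked_out_mask = (encoder_seq_lens_tensor != 0).reshape(-1, 1)
    # rows: False, True, False, True
    skip_cross_attention = int(encoder_seq_lens_tensor.max()) == 0   # False here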
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f29aa14..f1d796ca26a16 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea9db99..6367b770a0aff 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d2719ca2d00d..8d71b19060bf4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,8 @@ def is_hybrid_model( ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d577e545a473b..605a0ecf4e0a9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -333,10 +333,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973c97b8..01a232fdc76de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors( device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..61e8881b64f5d 100644 --- 
a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -166,7 +166,8 @@ def prepare( pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -198,8 +199,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. Args: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index ead3dab05a6b1..23a7126fb05cf 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -38,8 +38,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0debcd8..20063a5b4b085 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8e9802c7d333c..af1c4dfcebbc0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -510,8 +510,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. 
- return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a258b2d..6bf7587cdda19 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ def get_spec_proposals( sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..c88820ab27b69 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667d1cf3..1edf36329d83b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/utils.py b/vllm/utils.py index 17bffd2846b46..15481fb06e08e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,7 +29,7 @@ from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -352,7 +352,7 @@ def reset(self): self._index = 0 -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -697,7 +697,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None: 
init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. @@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index de7fb1a698df6..7a88cc9433b32 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 500bc356fc179..902800e0573bf 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -311,8 +311,8 @@ def output_token_latency_s_lst(self) -> List[float]: return [] latency_s_lst = [] for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] >= - self.output_token_ts_s_lst[i - 1]) + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) latency_s = (self.output_token_ts_s_lst[i] - self.output_token_ts_s_lst[i - 1]) latency_s_lst.append(latency_s) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d7e30079dfbb..a00c00c307335 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -205,7 +205,7 @@ def __init__( def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. 
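On the repeated @lru_cache(maxsize=None) -> @cache swaps in this series: functools.cache (Python 3.9+) is documented as exactly an unbounded lru_cache, so the change is cosmetic. A quick self-contained check:

    from functools import cache, lru_cache

    @cache                      # same behavior as @lru_cache(maxsize=None)
    def fib(n: int) -> int:
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    @lru_cache(maxsize=None)
    def fib_lru(n: int) -> int:
        return n if n < 2 else fib_lru(n - 1) + fib_lru(n - 2)

    assert fib(30) == fib_lru(30) == 832040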
for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 3c570212625c4..aaf9cb40bf2aa 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -173,13 +173,13 @@ def execute_model( cpu_fallback_ctx as cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a3f648f4cc645..8749518284288 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -316,8 +316,8 @@ def warmup_model( logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time() From ddee88d0ff2757bdef98a83a9c78af1ea4559758 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Mon, 27 Jan 2025 17:31:16 -0800 Subject: [PATCH 10/69] [Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache (#11277) Signed-off-by: Liangfu Chen Co-authored-by: Jiangfei Duan --- .buildkite/run-neuron-test.sh | 2 +- tests/neuron/test_prefix_prefill.py | 456 ++++++++++++++++++ vllm/attention/ops/nki_flash_attn.py | 669 +++++++++++++++++++++++++++ 3 files changed, 1126 insertions(+), 1 deletion(-) create mode 100644 tests/neuron/test_prefix_prefill.py create mode 100644 vllm/attention/ops/nki_flash_attn.py diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 0590dad4f311f..1ad77cf50f612 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py new file mode 100644 index 0000000000000..77b707a737118 --- /dev/null +++ b/tests/neuron/test_prefix_prefill.py @@ -0,0 +1,456 @@ +import random +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + + +class BlockDiagonalCausalFromBottomRightMask: + + @staticmethod + def _from_seqlens(query_lens, seq_lens, block_size=None): + from torch import logical_and, logical_or + + contexted = block_size is None + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + n_queries = sum(query_lens) + num_seqs = len(query_lens) + if contexted: + key_lens_blockaligned = seq_lens + 
else: + n_blocks_per_seq = (context_lens + block_size - 1) // block_size + offset_per_seq = n_blocks_per_seq * block_size + key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() + n_keys = sum(key_lens_blockaligned) + + a = (torch.arange(n_queries).reshape(n_queries, + 1).expand(n_queries, n_keys)) + b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) + q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) + k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) + + prior_mask = torch.zeros(n_queries, n_keys) + new_masks: list[torch.Tensor] = [] + for seq_id in range(num_seqs): + ri = q_cumsum[seq_id] + ci = k_cumsum[seq_id] + nr = query_lens[seq_id] + + if contexted: + nc = seq_lens[seq_id] + a_offset = ci + nc - ri - nr + new_mask = (a + a_offset) >= b + else: + nc = context_lens[seq_id] + a_offset = ci + nc - 1 + new_mask = a_offset >= b + + left_mask = b >= ci + top_mask = a >= ri + bottom_mask = a < (ri + nr) + + new_mask = logical_and( + logical_and(logical_and(new_mask, left_mask), top_mask), + bottom_mask, + ) + prior_mask = logical_or(prior_mask, new_mask) + new_masks = new_masks + [new_mask] + return prior_mask + + @staticmethod + def from_seqlens(query_lens, seq_lens, block_size=None): + contexted = block_size is None + if contexted: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens) + active_mask = None + else: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens, block_size) + active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, query_lens) + return prior_mask, active_mask + + +def ref_softmax(x: torch.Tensor, + dim: int, + mixed_precision=False, + return_max_reduce=False): + max_value = torch.amax(x, dim=dim, keepdims=True) + exp = torch.exp(x - max_value) + if mixed_precision: + sum_value = torch.sum(exp.astype(torch.float32), + dim=dim, + keepdims=True).astype(x.dtype) + else: + sum_value = torch.sum(exp, dim=dim, keepdims=True) + if return_max_reduce: + return exp / sum_value, max_value, torch.reciprocal(sum_value) + return exp / sum_value + + +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, + return_max_reduce: Optional[bool] = False, +) -> torch.Tensor: + scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() + if attn_mask is not None: + masked_score = scaled_qk + attn_mask.float() + if return_max_reduce: + norm_score, cached_max, cached_sum_reciprocal = ref_softmax( + masked_score, dim=-1, return_max_reduce=True) + else: + norm_score = ref_softmax(masked_score, dim=-1) + out = torch.einsum("hqk,khd->qhd", norm_score, value) + if return_max_reduce: + return ( + out, + cached_max, + cached_sum_reciprocal, + norm_score, + masked_score, + scaled_qk, + ) + else: + return out + + +def ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=False, +): + scale = float(1.0 / (head_size**0.5)) + if num_queries_per_kv > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) + value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) + + attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens) + + # convert binary mask to -inf values + attn_mask = torch.logical_not(attn_mask) + attn_mask = attn_mask.float() * -30000 + + output, cached_max, 
cached_sum_reciprocal, lse, masked_score, scaled_qk = ( + ref_masked_attention( + query, + key, + value, + scale, + attn_mask, + return_max_reduce=return_max_reduce, + )) + + output = output.unsqueeze(1) + if return_max_reduce: + return ( + output, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) + else: + return output + + +@pytest.mark.parametrize( + "num_heads,num_queries_per_kv,head_size,mixed_precision", + [ + (4, 2, 8, False), + (4, 2, 8, True), + (32, 8, 64, True), + ], +) +@torch.inference_mode() +def test_contexted_kv_attention( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + mixed_precision: bool, +) -> None: + import os + + import torch_xla.core.xla_model as xm + + from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + + device = xm.xla_device() + + os.environ["NEURON_CC_FLAGS"] = ( + " --model-type=transformer -O1 " + " --internal-hlo2tensorizer-options='--verify-hlo' ") + + random.seed(0) + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + + min_ctx_len = 2 + max_ctx_len = 64 + min_query_len = 2 + max_query_len = 64 + prefill_batch_size = 2 + decode_batch_size = 6 + batch_size = prefill_batch_size + decode_batch_size + block_size = 32 + max_model_len = (max_query_len + max_ctx_len) * 4 + + max_block_per_request = max_model_len // block_size + dtype = torch.float32 + cache_size = (batch_size * max_block_per_request) + 2 + ctx_lens = [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(prefill_batch_size) + ] + [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(decode_batch_size) + ] + query_lens = [ + random.randint(min_query_len, max_query_len) + for _ in range(prefill_batch_size) + ] + [1 for _ in range(decode_batch_size)] + seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv + + num_tokens = sum(query_lens) + query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + query.uniform_(-1, 1) + torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) + kv.uniform_(-1, 1) + key, value = kv.unbind(dim=1) + + k_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + v_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + values = torch.arange(0, cache_size, dtype=torch.long) + values = values[torch.randperm(cache_size)] + block_table = values[:batch_size * max_block_per_request].view( + batch_size, max_block_per_request) + torch.tensor(seq_lens, dtype=torch.long) + b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], + dtype=torch.long), + dim=0) + # copy kv to cache + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], + dtype=torch.long), + dim=0) + for i in range(batch_size): + for j in range(query_lens[i]): + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + + b_ctx_len[i] + j]) + cur_ctx = 0 + block_id = 0 + while cur_ctx < b_ctx_len[i]: + start_loc = b_seq_start_loc[i] + cur_ctx + if cur_ctx + block_size > b_ctx_len[i]: + end_loc = b_seq_start_loc[i] + b_ctx_len[i] + else: + end_loc = start_loc + block_size + start_slot = block_table[i, block_id] * block_size + 
end_slot = start_slot + end_loc - start_loc + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) + cur_ctx += block_size + block_id += 1 + + ( + output_ref, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=True, + ) + + # build neuron program + return_debug_tensors = False + B_P_SIZE = 128 + LARGE_TILE_SZ = 2048 + max_num_queries = ( + (sum(query_lens) + block_size - 1) // block_size) * block_size + + def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, + num_blocks): + context_lens = seq_lens - query_lens + blocks_per_seq = (context_lens + block_size - 1) // block_size + num_seqs = len(seq_lens) + active_blocks: list[int] = [] + for seq_id in range(num_seqs): + active_blocks = ( + active_blocks + + block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) + return F.pad( + torch.tensor(active_blocks), + (0, num_blocks - len(active_blocks)), + "constant", + 0, + ) + + def shift_bit_length(x): + return 1 << (x - 1).bit_length() + + # calculate input shapes + max_num_queries_shifted = shift_bit_length(max_num_queries) + max_num_queries_factor = B_P_SIZE // max_num_queries_shifted + max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor + assert (max_num_queries_padded == B_P_SIZE + ), "invalid {max_num_queries_padded=}" + head_size_padded = B_P_SIZE + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks_shifted = shift_bit_length( + ((context_lens + block_size - 1) // block_size).sum().item()) + num_active_blocks_factor = (LARGE_TILE_SZ // block_size // + num_active_blocks_shifted) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + assert (num_active_blocks * + block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}" + context_kv_len = num_active_blocks * block_size + assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}" + + # pad QKV tensors + pad_dims = ( + 0, + head_size_padded - query.shape[2], + 0, + 0, + 0, + max_num_queries_padded - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k, pad_dims, "constant", 0) + v = F.pad(v, pad_dims, "constant", 0) + k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0) + v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0) + + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + + # transform block table + active_block_table = get_active_block_tables( + block_table, + torch.tensor(query_lens), + torch.tensor(seq_lens), + block_size, + num_active_blocks, + ) + + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + attn_mask = torch.concat( + [ + F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + B_P_SIZE - prior_mask.shape[0], + ), + "constant", + 0, + ).bool(), + F.pad( + active_mask, + ( + 0, + B_P_SIZE - active_mask.shape[1], + 0, + B_P_SIZE - 
active_mask.shape[0], + ), + "constant", + 0, + ).bool(), + ], + dim=1, + ) + + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(torch.int32).to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + ) + + if return_debug_tensors: + output_nki, *debug_tensors = flash_attn_varlen_nkifunc( + *input_args, **input_kwargs) + else: + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + debug_tensors = [] + + output_nki = torch.tensor(output_nki).cpu() + debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors] + + num_actual_tokens = sum(query_lens) + print(f"{num_actual_tokens=}") + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.permute( + 0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py new file mode 100644 index 0000000000000..b9765b0f0283d --- /dev/null +++ b/vllm/attention/ops/nki_flash_attn.py @@ -0,0 +1,669 @@ +from dataclasses import dataclass + +import neuronxcc.nki.isa as nisa +import neuronxcc.nki.language as nl +import numpy as np +from neuronxcc import nki +from neuronxcc.nki.language import par_dim + + +@dataclass(frozen=True) +class FlashConfig: + """ + Config class for flash attention with default values + """ + + seq_tile_size: int = 2048 + should_transpose_v: bool = False + + __annotations__ = { + "seq_tile_size": int, + "should_transpose_v": bool, + } + + +@nki.jit +def transpose_p_local(p_local_transposed, + p_local, + LARGE_TILE_SZ, + forward_mask, + B_F_SIZE=512): + for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.sbuf, + dtype=p_local.dtype) + else: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.psum, + dtype=np.float32) + + for j in nl.affine_range(B_F_SIZE // 128): + j_128_slice = nl.ds(j * 128, 128) + i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) + + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + + p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( + p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask) + + +@nki.jit +def _flash_attention_core( + q_local_tile, + k, + v, + q_h_per_k_h, + seqlen_q, + nheads, + o_buffer, + l_buffer, + m_buffer, + batch_id, + head_id, + gqa_head_idx, + q_tile_idx, + local_k_large_tile_idx, + kernel_dtype, + acc_type, + flash_config: FlashConfig, + use_causal_mask=False, + continuous_batching_mask=None, + initialize=False, + B_P_SIZE=128, + B_F_SIZE=512, + B_D_SIZE=128, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=None, +): + """ + The flash attention core function to calculate self attention between a tile + of q and a block 
of K and V. + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF + already. The block size of K and V + is defined in the seq_tile_size of the flash_config. The results are stored + in the following three buffers + o_buffer: (B_P_SIZE, d) + l_buffer: (B_P_SIZE, 1) + m_buffer: (B_P_SIZE, 1) + """ + LARGE_TILE_SZ = flash_config.seq_tile_size + num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE + seqlen_k = k.shape[-1] + seqlen_q // B_P_SIZE + seqlen_k // B_F_SIZE + + # TODO : support logit_bias with continuous_batching_mask + assert not use_causal_mask, "causal mask is not supported." + assert (continuous_batching_mask + is not None), "continuous_batching_mask input is required." + if continuous_batching_mask is not None: + assert (logit_bias_tile is + None), "continuous_batching_mask does not support logit_bias!" + + # mask are used to only apply computation to the lower half of the matrix, + # which reduce the arthimetic intensity by half + forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * + LARGE_TILE_SZ if use_causal_mask else None) + + qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + buffer=nl.sbuf, + dtype=acc_type) + max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), + dtype=acc_type) + for k_i in nl.affine_range(num_k_tile_per_large_tile): + k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) + + qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, + buffer=nl.psum) # (128, 512) + qk_psum[:, :] = nl.matmul(q_local_tile, + k[:, k_i_b_f_slice], + transpose_x=True, + mask=None) # (p(128), 512) + + qk_res_buf[:, k_i_b_f_slice] = nl.where( + continuous_batching_mask[:, k_i_b_f_slice], + qk_psum[:, nl.ds(0, B_F_SIZE)], + -9984.0, + dtype=acc_type, + ) + + # Calculate max of the current tile + max_local[:, k_i] = nisa.tensor_reduce( + np.max, + qk_res_buf[:, k_i_b_f_slice], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + if qk_res_buffer is not None: + qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) + + max_ = nisa.tensor_reduce( + np.max, + max_local[:, :], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), + dtype=o_buffer.dtype) + + if initialize: + m_buffer[:, 0] = nl.copy(max_) + m_current = max_ + else: + m_previous = nl.copy(m_buffer[:, 0]) + m_buffer[:, 0] = nl.maximum(m_previous, max_, + mask=forward_mask) # (128,1) + + m_current = m_buffer[:, 0] + # Compute scaling factor + alpha = nisa.activation( + np.exp, + m_previous, + bias=-1 * m_current, + scale=1.0, + mask=forward_mask, + ) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], + alpha, + mask=forward_mask) + + p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) + + p_partial_sum = nl.ndarray( + (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) + + for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): + k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) + + # compute exp(qk - max) + # Compute partial row - tile sum of exp(qk - max)) + # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
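+        # activation_reduce fuses the element-wise exp(qk - max) with a
+        # row-wise add-reduction: p_local receives the softmax numerators and
+        # p_partial_sum[:, k_r_i] their per-row partial sums for this slice.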
+ p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( + np.exp, + qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, + scale=1.0, + reduce_op=nl.add, + reduce_res=p_partial_sum[:, k_r_i], + dtype=kernel_dtype, + mask=forward_mask, + ) + + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask) + + p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + transpose_p_local( + p_local_transposed=p_local_transposed, + p_local=p_local, + LARGE_TILE_SZ=LARGE_TILE_SZ, + forward_mask=forward_mask, + B_F_SIZE=B_F_SIZE, + ) + + pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), + dtype=np.float32, + buffer=nl.psum) + for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): + pv_psum[:, :] += nl.matmul( + p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], + v[k_i, :, :], + transpose_x=True, + mask=forward_mask, + ) # (128, 128) (p(Br), d) + + if initialize: + o_buffer[:, :] = nl.copy(pv_psum[:, :]) + l_buffer[:, 0] = nl.add(nl.log(ps), max_) + else: + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask) + + l_prev = l_buffer[:, 0] + l_exp = nl.add( + nl.exp( + nl.subtract(l_prev, m_current, mask=forward_mask), + mask=forward_mask, + ), + ps, + mask=forward_mask, + ) + l_buffer[:, 0] = nl.add(m_current, + nl.log(l_exp, mask=forward_mask), + mask=forward_mask) + + +@nki.jit +def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): + LARGE_TILE_SZ = config.seq_tile_size + B_P_SIZE = 128 + + if not config.should_transpose_v: + cur_v_tile[v_i, :, :] = nl.load( + v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], + dtype=cur_v_tile.dtype, + ) + return + + if nisa.get_nc_version() == nisa.nc_version.gen3: + cur_v_tile_transposed = nisa.dma_transpose( + v_hbm_tile[:, + nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)]) + cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed, + dtype=cur_v_tile.dtype) + return + + cur_v_tile[v_i, :, :] = nl.load_transpose2d( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)], + dtype=cur_v_tile.dtype, + ) + + +@nki.jit +def flash_paged_attention( + query, + key, + value, + key_cache, + value_cache, + block_tables, + mask, + softmax_scale=None, + mixed_precision=True, + config=None, + return_debug_tensors=False, +): + """ + Flash PagedAttention Forward Kernel. + - PagedAttention Paper: https://arxiv.org/abs/2309.06180 + - Chunked Prefill Paper: https://arxiv.org/abs/2403.02310 + + IO tensor layouts: + - query: shape (1, n_heads, d, seq_q) + - key: shape (1, n_kv_heads, d, seq_k) + - value: shape (1, n_kv_heads, seq_v, d) + - key_cache: (num_blocks, block_size, n_kv_heads, d) + - value_cache: (num_blocks, block_size, n_kv_heads, d) + - block_tables: (num_active_blocks, ) + - mask: (seq_q, num_active_blocks * block_size) + - o: shape (1, n_heads, seq_q, d) + - l_m: shape (1, n_heads, seq_q, 2) + + - This kernel requires seq_k == seq_v + - We use continuous batching by default, so the batch dimension is + always 1, and different requests are concatenated along sequence + dimension. + - We use paged cache blocks (key_cache, value_cache) to store KV cache. + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype except for + block_tables (int32) and mask (int32) + - If mixed_percision is True, then all Tensor Engine operation will be + performed in bfloat16 and accumulation will be performed in float32. + Otherwise the intermediates will be in the same type as the inputs. 
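+      - The attention output o is written back in the dtype of query; the
+        optional debug statistics (l, m) stay in the accumulation dtype.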
+ + Compile-time Constants: + - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` + - mixed_precision: flag to set non-matmul ops in fp32 precision, default + is set to `true`, if false, we use same precision as input types + - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig` + with Performance config parameters for flash attention with default + values + seq_tile_size: `default=2048`, size of the kv tile size for attention + computation reduction + + GQA support Notes: + the spmd kernel for launching kernel should be on kv_heads instead of + nheads + + Example usage: + MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] + usage: `flash_fwd[b, h](q, k, v, ...)` + GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] + usage: `flash_fwd[b, kv_h](q, k, v, ...)` + """ + config = config or FlashConfig() + B_F_SIZE = 512 + B_P_SIZE = 128 + b, h, d, seqlen_q = query.shape + B_D_SIZE = d + LARGE_TILE_SZ = config.seq_tile_size + n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine + num_blocks, block_size, k_h, _ = key_cache.shape + q_h_per_k_h = h // k_h + assert tuple(key_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert tuple(value_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert b == 1, f"invalid batch size {b=}" + assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" + kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype + acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype + + o = nl.ndarray((b, h, seqlen_q, d), + dtype=query.dtype, + buffer=nl.shared_hbm) + hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = ( + None, + None, + None, + None, + ) + if return_debug_tensors: + hbm_l_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_m_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + qk_res_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + assert ( + nl.program_ndim() == 2 + ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" 
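+    # The kernel is launched on a (batch, kv_head) SPMD grid; each program
+    # instance handles one KV head and all of the query heads mapped to it
+    # (GQA).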
+ batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + softmax_scale = softmax_scale or (1.0 / (d**0.5)) + + (num_active_blocks, ) = block_tables.shape + context_kv_len = num_active_blocks * block_size + assert (config.seq_tile_size >= 512 + ), f" seq tile_size {config.seq_tile_size} cannot be less than 512" + assert (context_kv_len % LARGE_TILE_SZ == 0 + ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}" + assert ( + LARGE_TILE_SZ % B_P_SIZE == 0 + ), f"Need LARGE_TILE_SZ ({LARGE_TILE_SZ}) to be divisible by {B_P_SIZE=}" + assert (B_P_SIZE % block_size == 0 + ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" + num_large_k_tile = context_kv_len // LARGE_TILE_SZ + num_blocks_per_large_tile = LARGE_TILE_SZ // block_size + assert (num_blocks_per_large_tile <= B_P_SIZE + ), f"The number of blocks in each large tile " \ + f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" + + block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), + 0, + dtype=np.int32, + buffer=nl.sbuf) + for j in nl.affine_range(num_large_k_tile): + i_p = nl.arange(num_blocks_per_large_tile)[:, None] + block_tables_sbuf[i_p, j] = nl.load( + block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32) + + # Global Flash Attention accumulators + o_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + l_buffer = nl.zeros( + (par_dim(B_P_SIZE), n_tile_q, q_h_per_k_h), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + m_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + for j in nl.sequential_range(0, num_large_k_tile): + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + for k_i in nl.affine_range(num_blocks_per_large_tile): + loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :, + head_id, :]) + cur_k_tile[:, nl.ds(k_i * + block_size, block_size)] = nl.transpose(loaded) + + load_tile_size = B_P_SIZE + num_blocks_per_partition = load_tile_size // block_size + for partition_idx in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + for block_in_partition in nl.affine_range( + num_blocks_per_partition): + v_i = (partition_idx * num_blocks_per_partition + + block_in_partition) + loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, + head_id, :]) + cur_v_tile[partition_idx, + nl.ds(block_in_partition * + block_size, block_size), :, ] = loaded_v + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=mask.dtype) + for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load( + mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + 
m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=j, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=j == 0, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + ) + + # compute attention between input query, key and value + if key is not None and value is not None: + B_F_SIZE = seqlen_q + LARGE_TILE_SZ = seqlen_q + active_config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=config.should_transpose_v, + ) + + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + cur_k_tile[:, :] = nl.load(key[batch_id, head_id, :, :]) + + load_tile_size = B_P_SIZE + v_hbm_tile = value[batch_id, head_id] + for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + load_v_tile( + v_hbm_tile=v_hbm_tile, + cur_v_tile=cur_v_tile, + j=0, + v_i=v_i, + config=active_config, + ) + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype) + cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=0, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=active_config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=False, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=qk_res_buffer[i, i_q_h] + if qk_res_buffer is not None else None, + ) + + # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + out = nl.multiply( + o_buffer[i, i_q_h, :, :], + nl.exp(m_buffer[i, i_q_h, :, :] - l_buffer[:, i, i_q_h]), + dtype=kernel_dtype, + ) + + nl.store( + o[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), :, ], + out, + ) + # maximum and summation statistics + if return_debug_tensors: + nl.store( + hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + m_buffer[i, i_q_h, :, :], + ) + nl.store( + hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + l_buffer[:, i, i_q_h], + ) + nl.store( + hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], + qk_res_buffer[batch_id, i_q_h, :, :], + ) + + if return_debug_tensors: + return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res + return o + + +def flash_attn_varlen_nkifunc( + query, + key, + value, + key_cache, + value_cache, + block_table, + attn_mask, + n_kv_head=None, + 
head_size=None, + B_P_SIZE=128, + LARGE_TILE_SZ=2048, + return_debug_tensors=False, + mixed_precision=True, +): + config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=False, + ) + kwargs = dict( + query=query, + key=key, + value=value, + key_cache=key_cache, + value_cache=value_cache, + block_tables=block_table, + mask=attn_mask, + softmax_scale=1.0 / (head_size**0.5), + config=config, + mixed_precision=mixed_precision, + return_debug_tensors=return_debug_tensors, + ) + _, n_kv_head, _, _ = key.shape + + if return_debug_tensors: + o, *debug_tensors = flash_paged_attention[1, n_kv_head](**kwargs) + return o, *debug_tensors + else: + o = flash_paged_attention[1, n_kv_head](**kwargs) + return o From 426a5c362557c6df4604ed084660b8915fbca30c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 27 Jan 2025 20:56:31 -0500 Subject: [PATCH 11/69] Fix bad path in prometheus example (#12481) Signed-off-by: mgoin --- examples/online_serving/prometheus_grafana/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index c49e5306a1cb4..4a85f953b0b4c 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -24,7 +24,7 @@ Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 ../../benchmarks/benchmark_serving.py \ +python3 ../../../benchmarks/benchmark_serving.py \ --model mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \ --endpoint /v1/completions \ From 23a7cbc88b5a17499766d1cbc0de283c9f980509 Mon Sep 17 00:00:00 2001 From: Hossein Sarshar Date: Mon, 27 Jan 2025 22:18:07 -0500 Subject: [PATCH 12/69] [CI/Build] Fixed the xla nightly issue report in #12451 (#12453) --- requirements-tpu.txt | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 51a0c65eac5aa..1abde714af7c9 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -10,17 +10,14 @@ wheel jinja2 ray[default] -# Install torch, torch_xla +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -# Note: This torch whl can be slightly different from the official torch nightly whl -# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue -# if some change between the 2 commits introduce some C++ API change. -# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue. 
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.6.0.dev20241216+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 0f465ab53303fbd3c8ad32163db161cdb0cf8dad Mon Sep 17 00:00:00 2001 From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:30:13 -0300 Subject: [PATCH 13/69] [FEATURE] Enables offline /score for embedding models (#12021) Signed-off-by: Gabriel Marinho --- .../models/embedding/language/test_scoring.py | 100 +++++++++++ vllm/entrypoints/llm.py | 160 +++++++++++++----- 2 files changed, 216 insertions(+), 44 deletions(-) diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index be6e3842821e2..3db27d942ac8c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -5,12 +5,18 @@ import math import pytest +import torch +import torch.nn.functional as F MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + TEXTS_1 = [ "What is the capital of France?", "What is the capital of Germany?", @@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + 
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860ed3d7db5a..46b595b0da73c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,6 +5,7 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch import torch.nn as nn from tqdm import tqdm from typing_extensions import TypeVar, deprecated @@ -996,6 +997,107 @@ def classify( return [ClassificationRequestOutput.from_base(item) for item in items] + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + encoded_output = self.encode( + text_1 + text_2, + use_tqdm=use_tqdm, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + encoded_output_1 = encoded_output[0:len(text_1)] + encoded_output_2 = encoded_output[len(text_1):] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + output_pairs = [(t1, t2) + for t1, t2 in zip(encoded_output_1, encoded_output_2)] + + scores = [] + scorer = torch.nn.CosineSimilarity(0) + + for embed_1, embed_2 in output_pairs: + pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data) + + if (pad_token_id := getattr(tokenizer, "pad_token_id", + None)) is not None: + tokens = embed_1.prompt_token_ids + [ + pad_token_id + ] + embed_2.prompt_token_ids + else: + tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + 
request_id=f"{embed_1.request_id}_{embed_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + finished=True)) + + items = self.engine_class.validate_outputs(scores, + PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: Union[AnyTokenizer], + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Score API is only enabled for `--task embed or score`") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + def score( self, text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], @@ -1047,25 +1149,20 @@ def score( raise ValueError(" ".join(messages)) - if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support cross encoding") - if self.llm_engine.model_config.task != "score": - raise ValueError("Score API is only enabled for `--task score`") - - tokenizer = self.llm_engine.get_tokenizer() - - if isinstance(tokenizer, MistralTokenizer): + if self.llm_engine.model_config.task not in ("embed", "score"): raise ValueError( - "MistralTokenizer not supported for cross-encoding") + "Score API is only enabled for `--task embed or --task score`") # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.llm_engine.get_tokenizer() + def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): if "multi_modal_data" in prompt: raise ValueError("Multi-modal prompt is not " - "supported for cross encoding") + "supported for scoring") elif "prompt_token_ids" in prompt: prompt = tokenizer.decode( cast(TokensPrompt, prompt)["prompt_token_ids"]) @@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt): if len(text_2) == 0: raise ValueError("At least one text_pair element must be given") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] - pooling_params = PoolingParams() - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - 
parsed_prompts = [] - - for q, t in input_pairs: - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) - - self._validate_and_add_requests( - prompts=parsed_prompts, - params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - - outputs = self._run_engine(use_tqdm=use_tqdm) - items = self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) - - return [ScoringRequestOutput.from_base(item) for item in items] + if self.llm_engine.model_config.is_cross_encoder: + return self._cross_encoding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, + prompt_adapter_request) + else: + return self._embedding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() From dd66fd2b01e1195b7ccc8ffcd4b5d49ff1946a56 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 28 Jan 2025 14:11:05 +0800 Subject: [PATCH 14/69] [CI] fix pre-commit error (#12494) Signed-off-by: Mengqing Cao --- vllm/attention/ops/nki_flash_attn.py | 37 +++++++++++++++++--------- vllm/spec_decode/spec_decode_worker.py | 8 +++--- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index b9765b0f0283d..9de4ef7f5a140 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -106,11 +106,12 @@ def _flash_attention_core( assert (continuous_batching_mask is not None), "continuous_batching_mask input is required." if continuous_batching_mask is not None: - assert (logit_bias_tile is - None), "continuous_batching_mask does not support logit_bias!" + assert ( + logit_bias_tile + is None), "continuous_batching_mask does not support logit_bias!" 
# mask are used to only apply computation to the lower half of the matrix, - # which reduce the arthimetic intensity by half + # which reduce the arithmetic intensity by half forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * LARGE_TILE_SZ if use_causal_mask else None) @@ -468,9 +469,11 @@ def flash_paged_attention( block_in_partition) loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, head_id, :]) - cur_v_tile[partition_idx, - nl.ds(block_in_partition * - block_size, block_size), :, ] = loaded_v + cur_v_tile[ + partition_idx, + nl.ds(block_in_partition * block_size, block_size), + :, + ] = loaded_v cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=mask.dtype) @@ -601,20 +604,30 @@ def flash_paged_attention( ) nl.store( - o[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), :, ], + o[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + :, + ], out, ) # maximum and summation statistics if return_debug_tensors: nl.store( - hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), ], + hbm_m_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], m_buffer[i, i_q_h, :, :], ) nl.store( - hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, - nl.ds(i * B_P_SIZE, B_P_SIZE), ], + hbm_l_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], l_buffer[:, i, i_q_h], ) nl.store( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index af1c4dfcebbc0..8d6d05cbaea75 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -870,10 +870,10 @@ def _verify_tokens( accepted_index = accepted_token_ids + 1 # Convert -1 to 0 accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b # Drop non-terminal prefill chunks hidden states. - hidden_states = hidden_states[ - accepted_index != VLLM_INVALID_TOKEN_ID] - accepted_index = accepted_index[ - accepted_index != VLLM_INVALID_TOKEN_ID] + hidden_states = hidden_states[accepted_index != + VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[accepted_index != + VLLM_INVALID_TOKEN_ID] assert len(accepted_index) == hidden_states.shape[0] == len( terminal_metadata) index = accepted_index[:, None, None].expand(-1, 1, From 8cbc4249758d399c0606ef4a1241e01176d0160b Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:22:41 -0800 Subject: [PATCH 15/69] Update README.md with V1 alpha release (#12495) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4ed905bf7aa9d..5fd30f2b1b9d7 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! 
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). From e29d4358ef054163b80dfb7e53ce3eb0e08d1328 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Tue, 28 Jan 2025 03:27:41 -0500 Subject: [PATCH 16/69] [V1] Include Engine Version in Logs (#12496) Signed-off-by: rshaw@neuralmagic.com --- vllm/engine/llm_engine.py | 2 +- vllm/v1/engine/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ab67ae29723cd..dd677300fc66a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -230,7 +230,7 @@ def __init__( ) logger.info( - "Initializing an LLM engine (v%s) with config: %s, " + "Initializing a V0 LLM engine (v%s) with config: %s, " "use_cached_outputs=%s, ", VLLM_VERSION, vllm_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index cf94033a38d96..f50303bda58fd 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -42,7 +42,7 @@ def __init__( ): assert vllm_config.model_config.runner_type != "pooling" - logger.info("Initializing an LLM engine (v%s) with config: %s", + logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) # Setup Model. From 2079e43beecc486a607c9d79ab691e0e4563aa11 Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Tue, 28 Jan 2025 11:56:45 +0100 Subject: [PATCH 17/69] [Core] Make raw_request optional in ServingCompletion (#12503) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sebastian Schönnenbeck --- vllm/entrypoints/openai/serving_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b0179f78bd635..13c3926368890 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -58,7 +58,7 @@ def __init__( async def create_completion( self, request: CompletionRequest, - raw_request: Request, + raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. 
@@ -137,7 +137,7 @@ async def create_completion( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - trace_headers = (await + trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): From 8f58a5135874770ac8429f4772d7f92fe33094e5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 29 Jan 2025 00:25:05 +0800 Subject: [PATCH 18/69] [VLM] Merged multi-modal processor and V1 support for Qwen-VL (#12504) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 2 +- .../multimodal/processing/test_common.py | 64 +- .../models/multimodal/processing/test_qwen.py | 144 ---- vllm/model_executor/models/qwen.py | 654 ++++++++++-------- 4 files changed, 387 insertions(+), 477 deletions(-) delete mode 100644 tests/models/multimodal/processing/test_qwen.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 8cdc663a0320f..e59150cdd3b83 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - ✅︎ - ✅︎ - - + - ✅︎ * - `Qwen2AudioForConditionalGeneration` - Qwen2-Audio - T + A+ diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index fe5b733c750a8..b575ec6acbef3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -16,7 +16,6 @@ def _test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -25,11 +24,6 @@ def _test_processing_correctness( model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } - model_config = ModelConfig( model_id, task="auto", @@ -40,18 +34,29 @@ def _test_processing_correctness( dtype="float16", revision=None, hf_overrides=model_info.hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_info.trust_remote_code, + ), ) # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) + processing_info = factories.info(ctx) + supported_mm_limits = processing_info.get_supported_mm_limits() + limit_mm_per_prompt = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + + model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) dummy_inputs = baseline_processor.dummy_inputs @@ -82,8 +87,8 @@ def _test_processing_correctness( mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities + for _ in range(rng.randint(limit))] + for k, limit in limit_mm_per_prompt.items() } mm_counts = {k: len(vs) for k, vs in mm_data.items()} @@ -135,21 +140,22 
@@ def _test_processing_correctness( # yapf: disable # True if the model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), +@pytest.mark.parametrize("model_id", [ + "rhymes-ai/Aria", + "Salesforce/blip2-opt-2.7b", + "facebook/chameleon-7b", + "deepseek-ai/deepseek-vl2-tiny", + "adept/fuyu-8b", + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "mistral-community/pixtral-12b", + "Qwen/Qwen-VL-Chat", + "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "fixie-ai/ultravox-v0_3", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -157,14 +163,12 @@ def _test_processing_correctness( # yapf: enable def test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, ): _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, @@ -172,16 +176,13 @@ def test_processing_correctness( # yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) # yapf: enable def test_processing_correctness_phi3v( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -195,7 +196,6 @@ def test_processing_correctness_phi3v( _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, diff --git a/tests/models/multimodal/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py deleted file mode 100644 index af0ace711ba3e..0000000000000 --- a/tests/models/multimodal/processing/test_qwen.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for Qwen's multimodal preprocessing kwargs.""" -from typing import Dict, List, Union - -import pytest -import torch -from PIL.Image import Image - -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer - -from ....conftest import IMAGE_ASSETS -from ...utils import build_model_context - -### Multimodal preprocessing tests -SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image -# These values are specific to Qwen-VL/Chat; we can get these from the model -# config also, but they are hardcoded here to keep the 
parameterize/fixtures -# easy to read. -IMG_START_ID = 151857 -IMG_END_ID = 151858 -IMG_PAD_ID = 151859 -TOKS_PER_IMG = 256 -VIS_ENC_DIM = 4096 -IMG_SIZE = 448 - - -@pytest.fixture() -def input_mapper_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_mapper_for_qwen - return input_mapper_for_qwen - - -@pytest.fixture() -def input_processor_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_processor_for_qwen - return input_processor_for_qwen - - -@pytest.fixture() -def qwen_vl_context() -> InputContext: - """Get an InputContext for Qwen-VL.""" - return build_model_context(model_name="Qwen/Qwen-VL", - trust_remote_code=True) - - -# Happy path tests for single/multi-image scenarios for the multimodal -# input processor and mapper, respectively -@pytest.mark.parametrize("num_images", [1, 2]) -def test_input_processor_valid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - num_images: int): - """Happy cases for image inputs to Qwen's multimodal input processor.""" - prompt = "".join( - [f"Picture {num}: \n" for num in range(1, num_images + 1)]) - inputs = token_inputs( - prompt=prompt, - # When processing multimodal data for a multimodal model, the qwen - # input processor will overwrite the provided prompt_token_ids with - # the image prompts - prompt_token_ids=[], - multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, - ) - proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) - assert isinstance(proc_inputs, dict) - - # Each image should have one start / stop and a fixed context of 256 - proc_tokens = proc_inputs["prompt_token_ids"] - assert proc_tokens.count(IMG_START_ID) == num_images - assert proc_tokens.count(IMG_END_ID) == num_images - assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG - - -@pytest.mark.parametrize( - "img_data,expected_shape", - [ - # single / multi-image - (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)), - (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), - # single / multi-image embeddings - (torch.rand( - (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), - ]) -def test_input_mapper_valid_mm_data(input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], - Image], - expected_shape: List[int]): - """Happy cases for image inputs to Qwen's multimodal input mapper.""" - mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) - # Ensure that we get the appropriately shaped pixel_values - # for images and image embeddings, respectively. 
- assert isinstance(mapped_img_data, MultiModalKwargs) - assert "pixel_values" in mapped_img_data - assert mapped_img_data["pixel_values"].shape == expected_shape - - -# Sad path tests for the multimodal input processor and mapper, respectively -@pytest.mark.parametrize("mm_data", [ - { - "image": torch.rand(5) - }, - { - "image": torch.rand((5, 5, 5, 5, 5)) - }, -]) -def test_input_processor_invalid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - mm_data: Dict[str, torch.Tensor]): - """Test sad cases validated in Qwen's multimodal input processor.""" - tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, - trust_remote_code=True) - prompt = "Picture 1: \n" - prompt_token_ids = tokenizer.encode(prompt) - inputs = token_inputs(prompt=prompt, - prompt_token_ids=prompt_token_ids, - multi_modal_data=mm_data) - # Should fail since we have too many or too few dimensions for embeddings - with pytest.raises(ValueError): - input_processor_for_qwen(qwen_vl_context, inputs) - - -@pytest.mark.parametrize( - "img_data", - [ - # Wrong context length - torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), - # Wrong visual encoder output size - torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), - ]) -def test_input_mapper_invalid_mm_data( - input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], Image], -): - """Sad cases validated in Qwen VL's multimodal input mapper.""" - with pytest.raises(ValueError): - input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 1345b381f0a99..86a9d3089c3ee 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -4,26 +4,28 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" +import copy import math import re -from functools import partial -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Set, Tuple, TypedDict, Union) +import unicodedata +from functools import lru_cache, partial +from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable, + List, Literal, Mapping, Optional, Set, Tuple, TypedDict, + Union) -import numpy as np import torch -from PIL import Image from torch import nn from torchvision import transforms from torchvision.transforms import InterpolationMode -from transformers import PretrainedConfig +from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, + TensorType) +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -42,15 +44,20 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.utils 
import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -353,8 +360,10 @@ def __init__(self, self.ln_post = norm_layer(output_dim) self.proj = nn.Parameter( (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + self.image_start_id = image_start_id self.image_end_id = image_start_id + 1 + self.image_pad_id = image_start_id + 2 def forward(self, x: torch.Tensor) -> torch.Tensor: x = x.to( @@ -383,21 +392,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x - def get_image_positions(self, - input_ids: torch.Tensor) -> Optional[torch.Tensor]: - """Given the input IDs, extracts start/stop points corresponding to - images. - - args: - Returns: - Optional torch tensor corresponding to start/stop pairs of images. - """ - if torch.any(input_ids == self.image_start_id): - bos_pos = torch.where(input_ids == self.image_start_id) - eos_pos = torch.where(input_ids == self.image_end_id) - return torch.stack((bos_pos[0], eos_pos[0]), dim=1) - return None - class QWenMLP(nn.Module): """MLP for the language component of the Qwen model, which contains a @@ -579,9 +573,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - self.visual = VisionTransformer(**config.visual, - quant_config=quant_config) if hasattr( - config, "visual") else None + + if (vision_config := getattr(config, "visual", None)): + self.visual = VisionTransformer(**vision_config, + quant_config=quant_config) + else: + self.visual = None def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.wte(input_ids) @@ -593,38 +590,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], - pixel_values: Optional[QwenImageInputs], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - img_pos = None - # If pixel / visual embeddings are provided, this is a visual model - if pixel_values is not None and self.visual is not None: - if pixel_values["type"] != "image_embeds": - image_embeds = self.visual(pixel_values["data"]) - else: - image_embeds = pixel_values["data"] - - # features should be of shape (# images, 256, hidden_dim) - img_pos = self.visual.get_image_positions(input_ids) - if isinstance( - img_pos, - np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]: - raise ValueError( - f"Number of placeholders: {img_pos.shape[0]} " - f"does not match number of images {image_embeds.shape[0]}." 
- ) - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - hidden_states = self.wte(input_ids) - # Merge the image embeddings into the hidden states if actually have - # visual features and the corresponding image tokens - if img_pos is not None: - for idx, (img_bos, img_eos) in enumerate(img_pos): - hidden_states[img_bos + 1:img_eos] = image_embeds[idx] residual = None else: assert intermediate_tensors is not None @@ -648,159 +620,9 @@ def forward( return hidden_states -def get_image_text(image_num: int, padding: bool) -> str: - """Retrieves a placeholder text that when tokenized, will be expanded with - image pads. - - Args: - image_num: The number of the image that we want a text prompt for. - Images should be indexed starting at 1. - padding: Whether or not padding should be manually added. - - Returns: - Text placeholder prompt for the image being considered. - """ - image_start = f"Picture {image_num}: {IMG_START}" - image_end = f"{IMG_END}\n" - if not padding: - return f"{image_start}{image_end}" - return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}" - - -def input_processor_for_qwen(ctx: InputContext, - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: - """Processes the inputs, which may or may not be multimodal. - Multimodal inputs will only be processed if the model has a "visual" - component in its model config, otherwise they'll be ignored. - - Args: - ctx: Context of the loaded model. - inputs: LLM inputs which may have a multi_modal_data attribute. - - Returns: - If the model is language only or not multimodal inputs were provided, - returns inputs unmodified. Otherwise, processes the multimodal - images / image embeddings and adds the fixed-length image placeholders. - """ - multi_modal_data = inputs.get("multi_modal_data") - - # Only process images if we have multimodal data and a visual config - hf_config = ctx.get_hf_config() - if (multi_modal_data is None or "image" not in multi_modal_data - or not hasattr(hf_config, "visual")): - return inputs - - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_data = multi_modal_data["image"] - if isinstance(image_data, torch.Tensor): - num_dims = len(image_data.shape) - if num_dims < 2 or num_dims > 3: - raise ValueError( - f"Expected img embeds to be have 3 dimensions, got {num_dims}") - num_images = 1 if num_dims == 2 else image_data.shape[0] - elif isinstance(image_data, Image.Image): - num_images = 1 - elif is_list_of(image_data, Image.Image): - num_images = len(image_data) - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - # Drops anything between / tags; encoding with the tokenizer - # will automatically add the image pads for the context. 
- new_prompt, num_matched_images = re.subn( - r"(Picture \d*: ).*?(<\/img>\n)", - r"\1\2", - prompt, - ) - - if num_matched_images != num_images: - logger.warning( - "Number of matched image placeholders %s doesn't match the number " - "of expected images %s; check your placeholder formatting.", - num_matched_images, num_images) - - new_prompt_token_ids = tokenizer.encode(new_prompt) - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) - - -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: - """Maps the input data to its MultiModalKwargs (if any). - - Args: - ctx: Context of the loaded model. - data: data potentially containing image/image embeddings to be mapped - to pixel_values in .forward() for a visual QWenLMHeadModel model. - - Returns: - MultiModalKwargs containing the stacked normalized images tensor or - image embeddings. - """ - # Early exit if we have provided an image to a language only Qwen model - hf_config = ctx.get_hf_config() - if not hasattr(hf_config, "visual"): - logger.warning( - "Images were provided but this model has no visual config; " - "multimodal inputs will not be forwarded to the model.") - return MultiModalKwargs() - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - image_pair_tok = tokenizer.encode(IMG_START + IMG_END, - add_special_tokens=False, - return_tensors="pt").squeeze() - image_start_id = image_pair_tok[0] - image_end_id = image_pair_tok[-1] - if (image_start_id + 1) != image_end_id: - raise ValueError( - f"Found image end ID {image_end_id}, but expected {IMG_START} + 1") - if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2): - raise ValueError( - f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, " - f"but got {image_pair_tok - 2}") - - hf_config = ctx.get_hf_config() - image_size = hf_config.visual["image_size"] - img_emb_size = hf_config.visual["output_dim"] - - if isinstance(data, torch.Tensor): - # It's expected that our values have already been processed - # by the visual transformer; shape is expected to be: - # (# images, 256, hidden_size) - if len(data.shape) == 2: - # Assume only one image embed was provided; unsqueeze the extra dim - data = data.unsqueeze(0) - if len(data.shape) != 3 or data.shape[ - 1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size: - raise ValueError( - "Expected image embeds to be a tensor of shape" - f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but " - f"received shape [{data.shape}]") - pixel_values = data - else: - transform = build_normalization_transform(image_size) - if not isinstance(data, (list, tuple)): - data = [data] - transformed_images = [transform(datum) for datum in data] - pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalKwargs({"pixel_values": pixel_values}) - - def build_normalization_transform(image_size: int) -> transforms.Compose: - """Builds a normalization transform which can be applied to one or + """ + Build a normalization transform which can be applied to one or more input images from which we want to extract visual features. 
Args: @@ -817,62 +639,251 @@ def build_normalization_transform(image_size: int) -> transforms.Compose: ]) -def dummy_data_for_qwen( - ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], -) -> DummyData: - """Build dummy data for warming up Qwen models; this will only contain text - matching the defaults for VLLM unless the model has a visual config. +@lru_cache(maxsize=1) +def _get_tokenizer_without_image_pad( + tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: + """ + The logic of adding image pad tokens should only be applied in + :class:`QWenVLProcessor`, so they are patched out here. - Args: - ctx: Context of the loaded model. - seq_len: Number of tokens in the text sequence. - mm_counts: multimodal data counts. - - Returns: - Tuple containing sequential and multimodal data. + The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.deepcopy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + + def tokenize( + self, + text: str, + allowed_special: Union[AbstractSet[str], str] = "all", + disallowed_special: Union[Collection[str], str] = (), + **kwargs, + ) -> list[Union[bytes, str]]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: Optional[str] = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = \ + f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QWenVLProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. + + We call the wrapped tokenizer to automatically insert image pad tokens: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 + + The image processor is defined here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 """ - hf_config = ctx.get_hf_config() - - # The presence of a visual config indicates this is a multimodal model. - # If we don't have it, the model is considered an LLM for warmup purposes. - if not hasattr(hf_config, "visual"): - seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - mm_data = None - return DummyData(seq_data, mm_data) - - # We have a visual component - use images to warm up - num_images = mm_counts["image"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - # Build the image prompts with no imgpads; the tokenizer will add img pads - image_prompt = ''.join( - [get_image_text(idx, False) for idx in range(1, num_images + 1)]) - toks = tokenizer.encode(image_prompt, add_special_tokens=False) - - # Make sure we actually get the fixed context size per tok padding - num_pads = toks.count(tokenizer.encode(IMG_PAD)[0]) - if num_pads != (num_images * MAX_QWEN_IMG_TOKENS): - raise ValueError( - f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads" - f" per image, but got {num_pads} pads for {num_images} image(s)" - " in total. 
Are you using a qwen tokenizer?") - - # Ensure the number of tokens is at minimum the sequence length provided - if len(toks) < seq_len: - toks += [0] * (seq_len - len(toks)) - - seq_data = SequenceData.from_seqs(toks) - - # Build the input images; width/height doesn't actually matter here since - # the data will get resized and the # of tokens per image is constant - image = Image.new("RGB", (224, 224), color=0) - mm_data = {"image": image if num_images == 1 else [image] * num_images} - return DummyData(seq_data, mm_data) + + def __init__( + self, + config: PretrainedConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + if hasattr(self.config, "visual"): + self.image_transform = build_normalization_transform( + config.visual["image_size"]) + else: + self.image_transform = None + + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + self.img_start_id = special_tokens[IMG_START] + self.img_end_id = special_tokens[IMG_END] + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + if self.image_transform is None: + raise ValueError("This model does not support image inputs") + + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class QWenVLProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> PreTrainedTokenizer: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return _get_tokenizer_without_image_pad(tokenizer) + + def get_hf_processor(self) -> QWenVLProcessor: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return QWenVLProcessor(self.get_hf_config(), tokenizer) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + return MAX_QWEN_IMG_TOKENS + + +class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + if not hasattr(hf_config, "visual"): + return ProcessorInputs(prompt_text="", mm_data={}) + + vision_config = hf_config.visual + + max_image_size = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n" + for i in range(1, num_images + 1)), + mm_data=mm_data, + ) + + +class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> 
BatchFeature: + # Drops anything between / tags; encoding with the tokenizer + # will automatically add the image pads for the context. + prompt, num_matched_images = re.subn( + r"(Picture \d*: ).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + image_data = mm_data.get("images") + if image_data is not None: + assert isinstance(image_data, list) + + num_images = len(image_data) + if num_matched_images != num_images: + logger.warning( + "Number of matched image placeholders %s doesn't match " + "the number of expected images %s; check your placeholder " + "formatting.", num_matched_images, num_images) + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + + img_start_id = special_tokens[IMG_START] + img_end_id = special_tokens[IMG_END] + img_pad_id = special_tokens[IMG_PAD] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [img_pad_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[img_start_id, img_end_id], + replacement=PromptReplacementDetails( + full=[img_start_id] + image_tokens + [img_end_id], + features=image_tokens, + ), + ) + ] class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): @@ -898,38 +909,77 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) - def _get_image_input_type( - self, - pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]: - """Determines if the provided pixel_values are normalized pixel values - or image embeddings. + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.visual["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) - Args: - pixel_values: Optional data to processed into visual embeddings. + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[QwenImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return QwenImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + + return QwenImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) - Returns: - None of the QwenImageInputs type used to determine whether or not - the visual transformer needs to process the pixel_values. - """ - if pixel_values is not None and self.transformer.visual is not None: - pixel_values = flatten_bn(pixel_values) - if len(pixel_values.shape) == 3 and pixel_values.shape[ - 1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[ - 2] == self.config.visual["output_dim"]: - return QwenImageEmbeddingInputs( - type="image_embeds", - data=pixel_values, - ) - else: - # If we have the wrong shape, assume we still need to process - return QwenImagePixelInputs( - type="pixel_values", - data=pixel_values, - ) return None - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.transformer.get_input_embeddings(input_ids) + def _process_image_input(self, + image_input: QwenImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.transformer.visual is not None + return self.transformer.visual(image_input["data"]) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + assert self.transformer.visual is not None + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.transformer.visual.image_pad_id) + + return inputs_embeds def forward( self, @@ -938,18 +988,23 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - pixel_values: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - pixel_values = None - else: - pixel_values = self._get_image_input_type(pixel_values) hidden_states = self.transformer(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, - pixel_values, inputs_embeds) + inputs_embeds) return hidden_states def compute_logits( @@ -1063,10 +1118,9 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="transformer.visual.transformer") -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen) -@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) -@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) +@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor, + info=QWenVLProcessingInfo, + dummy_inputs=QWenVLDummyInputsBuilder) class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not @@ -1084,7 +1138,7 @@ def __new__( cls, vllm_config: VllmConfig, prefix: str = "", - ) -> None: + ) -> QWenBaseModel: config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): From 925d2f19089b50736ce5e0f2ba0c9b7f3da6fb15 Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Tue, 28 Jan 2025 11:37:10 -0500 Subject: [PATCH 19/69] [Doc] Fix typo for x86 CPU installation (#12514) Signed-off-by: Jun Duan --- docs/source/getting_started/installation/cpu/x86.inc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index e4f99d3cebdf2..e0eaac5099305 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -18,7 +18,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, ::: ```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. 
``` From 3fd1fb63efb6c96f30237b12e2816b4f2c5323d0 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 28 Jan 2025 16:38:38 +0000 Subject: [PATCH 20/69] [V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 7 ++- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/metrics/loggers.py | 68 ++++++++++++++++++++---- 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 469a5fb039fb6..64deaedf0f2c1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -105,8 +105,6 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool): - if use_v1: - pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): + if use_v1 and metric_family not in EXPECTED_METRICS_V1: + continue + found_metric = False # Check to see if the metric_family is found in the prom endpoint. @@ -199,6 +200,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", + "vllm:prompt_tokens_total", + "vllm:generation_tokens_total", ] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 917d52d3220b8..022b6d0668e99 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -305,7 +305,8 @@ def _log_stats( return for logger in self.stat_loggers: - logger.log(scheduler_stats=scheduler_stats) + logger.log(scheduler_stats=scheduler_stats, + iteration_stats=iteration_stats) def encode( self, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b84f03fa3267c..6a7bb423749e1 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,11 +1,12 @@ import time from abc import ABC, abstractmethod -from typing import Dict +from typing import Dict, List +import numpy as np import prometheus_client from vllm.logger import init_logger -from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -15,27 +16,61 @@ class StatLoggerBase(ABC): @abstractmethod - def log(self, scheduler_stats: SchedulerStats): + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): ... class LoggingStatLogger(StatLoggerBase): def __init__(self): - self.last_log_time = time.monotonic() + self._reset(time.monotonic()) - def log(self, scheduler_stats: SchedulerStats): - """Log Stats to standard output.""" + def _reset(self, now): + self.last_log_time = now + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + def _local_interval_elapsed(self, now: float) -> bool: # Log every _LOCAL_LOGGING_INTERVAL_SEC. + elapsed_time = now - self.last_log_time + return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC + + def _track_iteration_stats(self, iteration_stats: IterationStats): + # Save tracked stats for token counters. 
+ self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens) + self.num_generation_tokens.append( + iteration_stats.num_generation_tokens) + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + # Compute summary metrics for tracked stats + return float(np.sum(tracked_stats) / (now - self.last_log_time)) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): + """Log Stats to standard output.""" + + self._track_iteration_stats(iteration_stats) + now = time.monotonic() - if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + if not self._local_interval_elapsed(now): return - self.last_log_time = now + + prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) + generation_throughput = self._get_throughput( + self.num_generation_tokens, now) + + self._reset(now) # Format and print output. logger.info( + "Avg prompt throughput: %.1f tokens/s, " + "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Waiting: %d reqs ", + prompt_throughput, + generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) @@ -61,11 +96,26 @@ def __init__(self, labels: Dict[str, str]): documentation="Number of requests waiting to be processed.", labelnames=labelnames).labels(*labelvalues) - def log(self, scheduler_stats: SchedulerStats): + self.counter_prompt_tokens = prometheus_client.Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_generation_tokens = prometheus_client.Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): """Log to prometheus.""" self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) + self.counter_generation_tokens.inc( + iteration_stats.num_generation_tokens) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD From 0f657bdc52d4ad1d079beddf8e7556c419aca7b4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 14:06:32 -0500 Subject: [PATCH 21/69] Replace missed warning_once for rerank API (#12472) Signed-off-by: mgoin --- vllm/entrypoints/openai/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 45cf06566faaa..077bc993726ae 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -528,7 +528,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): @router.post("/v1/rerank") @with_cancellation async def do_rerank_v1(request: RerankRequest, raw_request: Request): - logger.warning( + logger.warning_once( "To indicate that the rerank API is not part of the standard OpenAI" " API, we have located it at `/rerank`. Please update your client" "accordingly. 
(Note: Conforms to JinaAI rerank API)") From f26d790718b8e50a11a366f3301b6a9300377797 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 28 Jan 2025 20:05:27 +0000 Subject: [PATCH 22/69] Do not run `suggestion` `pre-commit` hook multiple times (#12521) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b32df90bfd8b..77010090965d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,3 +90,4 @@ repos: entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true + pass_filenames: false From c386c43ca3a7156a953e0ca4d8f2c2f36ccf1423 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 28 Jan 2025 22:07:22 +0000 Subject: [PATCH 23/69] [V1][Metrics] Add per-request prompt/generation_tokens histograms (#12516) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 6 +++ vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/output_processor.py | 11 ++++- vllm/v1/metrics/loggers.py | 60 +++++++++++++++++++++--- vllm/v1/metrics/stats.py | 36 ++++++++++++-- 5 files changed, 102 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 64deaedf0f2c1..9a84c82b62fdf 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:num_requests_waiting", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_prompt_tokens_sum", + "vllm:request_prompt_tokens_bucket", + "vllm:request_prompt_tokens_count", + "vllm:request_generation_tokens_sum", + "vllm:request_generation_tokens_bucket", + "vllm:request_generation_tokens_count", ] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 022b6d0668e99..b9dc3561d1750 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -53,8 +53,7 @@ def __init__( self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - PrometheusStatLogger(labels=dict( - model_name=self.model_config.served_model_name)), + PrometheusStatLogger(vllm_config.model_config), ] # Tokenizer (+ ensure liveness if running in another process). diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 564eab51bd3a8..39217b8090140 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,7 +8,7 @@ from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import (DetokenizerOutput, IncrementalDetokenizer) -from vllm.v1.metrics.stats import IterationStats +from vllm.v1.metrics.stats import IterationStats, RequestStateStats @dataclass @@ -37,6 +37,8 @@ def __init__( self.is_prefilling = True self.queue = queue + self.stats = RequestStateStats() + @classmethod def from_new_request( cls, @@ -146,7 +148,8 @@ def process_outputs( # 1) Compute stats for this iteration. iteration_stats.update_from_output(engine_core_output, req_state.is_prefilling, - req_state.prompt_len) + req_state.prompt_len, + req_state.stats) req_state.is_prefilling = False # 2) Detokenize the token ids into text. @@ -171,6 +174,10 @@ def process_outputs( # detected stop string, abort needed in EngineCore. 
reqs_to_abort.append(req_id) + # Track per-request stats + iteration_stats.update_from_finished_request( + request_output, req_state.stats) + return OutputProcessorOutput( request_outputs=request_outputs, reqs_to_abort=reqs_to_abort, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6a7bb423749e1..87d9d63652c05 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,10 +1,11 @@ import time from abc import ABC, abstractmethod -from typing import Dict, List +from typing import List import numpy as np import prometheus_client +from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats, class PrometheusStatLogger(StatLoggerBase): - def __init__(self, labels: Dict[str, str]): - self.labels = labels + def __init__(self, model_config: ModelConfig): + self._unregister_vllm_metrics() - labelnames = self.labels.keys() - labelvalues = self.labels.values() + labelnames = ["model_name"] + labelvalues = [model_config.served_model_name] - self._unregister_vllm_metrics() + max_model_len = model_config.max_model_len self.gauge_scheduler_running = prometheus_client.Gauge( name="vllm:num_requests_running", @@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.histogram_num_prompt_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_generation_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + def log(self, scheduler_stats: SchedulerStats, iteration_stats: IterationStats): """Log to prometheus.""" @@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats, self.counter_generation_tokens.inc( iteration_stats.num_generation_tokens) + for finished_request in iteration_stats.finished_requests: + self.histogram_num_prompt_tokens_request.observe( + finished_request.num_prompt_tokens) + self.histogram_num_generation_tokens_request.observe( + finished_request.num_generation_tokens) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD for collector in list(prometheus_client.REGISTRY._collector_to_names): if hasattr(collector, "_name") and "vllm" in collector._name: prometheus_client.REGISTRY.unregister(collector) + + +def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: + """ + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values until the value exceeds the specified maximum. 
+ + """ + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + return build_buckets([1, 2, 5], max_value) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 60cb986f8bbce..55d85a7992cc5 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,7 +1,8 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List if TYPE_CHECKING: + from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput @@ -16,6 +17,21 @@ class SchedulerStats: # gpu_prefix_cache_hit_rate: float = 0.0 +@dataclass +class RequestStateStats: + """Stats that need to be tracked across delta updates.""" + + num_generation_tokens: int = 0 + + +@dataclass +class FinishedRequestStats: + """Stats associated with a finished request.""" + + num_prompt_tokens: int = 0 + num_generation_tokens: int = 0 + + class IterationStats: """Stats associated with a single set of EngineCoreOutputs.""" @@ -23,17 +39,29 @@ def __init__(self, log_stats: bool): self.log_stats = log_stats self.num_generation_tokens = 0 self.num_prompt_tokens = 0 + self.finished_requests: List[FinishedRequestStats] = [] def update_from_output(self, output: "EngineCoreOutput", - is_prefilling: bool, prompt_len: int): + is_prefilling: bool, prompt_len: int, + request_state_stats: RequestStateStats): if not self.log_stats: return - self.num_generation_tokens += len(output.new_token_ids) + num_new_generation_tokens = len(output.new_token_ids) + + self.num_generation_tokens += num_new_generation_tokens if is_prefilling: # This relies on the invariant that EngineCore does # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert (len(output.new_token_ids) > 0) + assert (num_new_generation_tokens > 0) self.num_prompt_tokens += prompt_len + + request_state_stats.num_generation_tokens += num_new_generation_tokens + + def update_from_finished_request(self, request_output: "RequestOutput", + request_state_stats: RequestStateStats): + self.finished_requests.append( + FinishedRequestStats(len(request_output.prompt_token_ids), + request_state_stats.num_generation_tokens)) From 80fcc3ed1c940ea43e1b495bbdf8b9765f837128 Mon Sep 17 00:00:00 2001 From: fenghuizhang <159459388+fenghuizhang@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:36:44 -0800 Subject: [PATCH 24/69] [Kernel] Pipe attn_logits_soft_cap through paged attention TPU kernels (#12482) Signed-off-by: Fenghui Zhang --- .buildkite/run-tpu-test.sh | 0 vllm/attention/backends/pallas.py | 42 ++++++++++++------------------- 2 files changed, 16 insertions(+), 26 deletions(-) mode change 100644 => 100755 .buildkite/run-tpu-test.sh diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh old mode 100644 new mode 100755 diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index facdee6b29e39..209a623ba441c 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -110,6 +110,7 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.logits_soft_cap = logits_soft_cap if head_size % 128 != 0: raise NotImplementedError("Head size must be a multiple of 128.") if alibi_slopes is not None: @@ -120,9 +121,6 @@ def __init__( raise NotImplementedError("FP8 KV cache dtype is not supported.") if blocksparse_params is not None: raise NotImplementedError("Blocksparse is not supported.") - if logits_soft_cap is not None: - raise NotImplementedError( - "Attention logits soft-capping is not supported.") if torch_xla.tpu.version() < 4: raise NotImplementedError("TPU version must be 4 or higher.") @@ -230,6 +228,7 @@ def forward( num_kv_pages_per_compute_block, num_queries_per_compute_block, use_kernel=True, + attn_logits_soft_cap=self.logits_soft_cap, ) else: # Decoding run. @@ -257,6 +256,7 @@ def forward( attn_metadata.block_tables, pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) else: chunk_size = max_num_seq @@ -280,6 +280,7 @@ def forward( attn_metadata.block_tables[chunk_start:chunk_end], pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) output[chunk_start:chunk_end] = chunk_output @@ -313,6 +314,8 @@ def paged_attention( block_tables: torch.Tensor, pages_per_compute_block: int, megacore_mode: Optional[str], + *, + attn_logits_soft_cap: Optional[float], ) -> torch.Tensor: batch_size = query.shape[0] if megacore_mode == "batch" and batch_size % 2 != 0: @@ -320,26 +323,13 @@ def paged_attention( else: megacore_mode = megacore_mode - # NOTE(woosuk): A temporary workaround to avoid the error: - # "xla::paged_attention() Expected a value of type 'str' for - # argument 'megacore_mode' but instead found type 'NoneType'." 
- if megacore_mode is not None: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - megacore_mode=megacore_mode, - ) - else: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - ) - return output + return torch.ops.xla.paged_attention( + query, + key_cache, + value_cache, + context_lens, + block_tables, + pages_per_compute_block, + megacore_mode=megacore_mode, + attn_logits_soft_cap=attn_logits_soft_cap, + ) From fbb5bd4cefd62e3e389e2b873d5859eb8e07cbfa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 22:16:47 -0500 Subject: [PATCH 25/69] [TPU] Add example for profiling TPU inference (#12531) Signed-off-by: mgoin --- .../offline_inference/profiling_tpu/README.md | 67 ++++++++++++ .../profiling_tpu/profiling.py | 101 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 examples/offline_inference/profiling_tpu/README.md create mode 100644 examples/offline_inference/profiling_tpu/profiling.py diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md new file mode 100644 index 0000000000000..08efa63dc1021 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/README.md @@ -0,0 +1,67 @@ +# vLLM TPU Profiling + +This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. + +Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. + +We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html). + +> In all examples below, we run several warmups before (so `--enforce-eager` is okay) + +## Profile Examples + +### Generate Prefill Trace + +This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=Qwen/Qwen2.5-7B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=3000 +export VLLM_TPU_PROFILE_DELAY_MS=0 + +python3 profiling.py \ + --model $MODEL \ + --input-len 1024 --output-len 1 \ + --batch-size 1 --enforce-eager \ + --max-model-len 2048 \ + --tensor-parallel-size 1 \ + --profile-result-dir profiles +``` + + +### Generate Decode Trace + +This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=meta-llama/Llama-3.1-70B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=2000 +export VLLM_TPU_PROFILE_DELAY_MS=1000 + +rm -rf ~/.cache/vllm/xla_cache +python3 profiling.py \ + --model $MODEL \ + --input-len 1 \ + --output-len 128 \ + --batch-size 32 \ + --enforce-eager \ + --profile-result-dir profiles \ + --max-model-len 2048 --tensor-parallel-size 8 +``` + + +## Visualizing the profiles + +Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). 
+ +Here are most likely the dependencies you need to install: +```bash +pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources +``` + +Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: +```bash +tensorboard --logdir profiles/ --port 6006 +``` \ No newline at end of file diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py new file mode 100644 index 0000000000000..d7423e6c6da93 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -0,0 +1,101 @@ +import argparse +import dataclasses +import os +import time +from typing import List + +import numpy as np +import torch_xla.debug.profiler as xp +from tqdm import tqdm + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.utils import FlexibleArgumentParser + +DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) +DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) + + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + _ = xp.start_server(9012) + + sampling_params = SamplingParams( + temperature=0.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def run_to_completion(): + start_time = time.perf_counter() + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + # Warmup + print("Warming up...") + warmup_latencies = [] + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + warmup_latencies.append(run_to_completion()) + print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") + + # Profile + profile_dir = args.profile_result_dir + print(f"Profiling (results will be saved to '{profile_dir}')...") + # Enable tracing on server + xp.trace_detached("localhost:9012", + profile_dir, + delay_ms=DELAY_MS, + duration_ms=DURATION_MS) + if DELAY_MS == 0: + time.sleep(1.0) + profile_latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profile iterations"): + profile_latencies.append(run_to_completion()) + print(f"Average profile latency: {np.mean(profile_latencies):.4f}s") + + return + + +if __name__ == '__main__': + parser = FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--num-iters-warmup', + type=int, + default=5, + help='Number of iterations to run for warmup.') + parser.add_argument('--num-iters', + type=int, + default=1, + help='Number of iterations to run for profiling.') + parser.add_argument( + '--profile-result-dir', + type=str, + default="profiles", + help= + ('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard ' + '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).' 
+ )) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) From a7e3eba66fff82f7e12bb2354c4b26635f0f7761 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 29 Jan 2025 11:38:08 +0800 Subject: [PATCH 26/69] [Frontend] Support reasoning content for deepseek r1 (#12473) Signed-off-by: Ce Gao Co-authored-by: Rafael Vasquez Co-authored-by: Cyrus Leung Co-authored-by: Michael Goin --- docs/source/features/reasoning_outputs.md | 151 +++++++++++++++++ docs/source/index.md | 1 + .../openai_chat_completion_with_reasoning.py | 53 ++++++ ...hat_completion_with_reasoning_streaming.py | 90 ++++++++++ .../openai/reasoning_parsers/__init__.py | 0 .../test_deepseekr1_reasoning_parser.py | 120 +++++++++++++ .../openai/reasoning_parsers/utils.py | 93 +++++++++++ tests/entrypoints/openai/test_cli_args.py | 29 ++++ vllm/entrypoints/openai/api_server.py | 10 ++ vllm/entrypoints/openai/cli_args.py | 30 ++++ vllm/entrypoints/openai/protocol.py | 2 + .../openai/reasoning_parsers/__init__.py | 6 + .../abs_reasoning_parsers.py | 158 ++++++++++++++++++ .../deepseek_r1_reasoning_parser.py | 133 +++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 105 +++++++++++- vllm/scripts.py | 1 + 16 files changed, 977 insertions(+), 5 deletions(-) create mode 100644 docs/source/features/reasoning_outputs.md create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning.py create mode 100644 examples/online_serving/openai_chat_completion_with_reasoning_streaming.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/__init__.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py create mode 100644 tests/entrypoints/openai/reasoning_parsers/utils.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/__init__.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md new file mode 100644 index 0000000000000..e39bbacf1138d --- /dev/null +++ b/docs/source/features/reasoning_outputs.md @@ -0,0 +1,151 @@ +(reasoning-outputs)= + +# Reasoning Outputs + +vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. + +Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. + +## Supported Models + +vLLM currently supports the following reasoning models: + +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for ` ... `) + +## Quickstart + +To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Next, make a request to the model that should return the reasoning content in the response. + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) +``` + +The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. + +## Streaming chat completions + +Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). + +```json +{ + "id": "chatcmpl-123", + "object": "chat.completion.chunk", + "created": 1694268190, + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": "is", + }, + "logprobs": null, + "finish_reason": null + } + ] +} +``` + +Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. + +## How to support a new reasoning model + +You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`. + +```python +# import the required packages + +from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( + ReasoningParser, ReasoningParserManager) +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + +# define a reasoning parser and register it to vllm +# the name list in register_module can be used +# in --reasoning-parser. +@ReasoningParserManager.register_module(["example"]) +class ExampleParser(ReasoningParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. 
+ """ +``` + +After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when making a request to the chat completion endpoint. + +```bash +vllm serve \ + --enable-reasoning --reasoning-parser example +``` + +## Limitations + +- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`). +- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features. +- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning. diff --git a/docs/source/index.md b/docs/source/index.md index 2c302d3f3e863..6957d5dd0f2e7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -90,6 +90,7 @@ models/extensions/index features/quantization/index features/lora features/tool_calling +features/reasoning_outputs features/structured_outputs features/automatic_prefix_caching features/disagg_prefill diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py new file mode 100644 index 0000000000000..83e51a48bcc6b --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -0,0 +1,53 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. + +To run this example, you need to start the vLLM server with the reasoning +parser: + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +This example demonstrates how to generate chat completions from reasoning models +using the OpenAI Python client library. +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) + +# Round 2 +messages.append({"role": "assistant", "content": content}) +messages.append({ + "role": "user", + "content": "How many Rs are there in the word 'strawberry'?", +}) +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py new file mode 100644 index 0000000000000..8c14aac6b4ecb --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -0,0 +1,90 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. 
+
+To run this example, you need to start the vLLM server with the reasoning
+parser:
+
+```bash
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --enable-reasoning --reasoning-parser deepseek_r1
+```
+
+Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
+streaming chat completions feature.
+
+The streaming chat completions feature allows you to receive chat completions
+in real time as they are generated by the model. This is useful when you want
+to display partial completions to the user as they arrive.
+
+Here we do not use the OpenAI Python client library, because it does not support
+`reasoning_content` fields in the response.
+"""
+
+import json
+
+import requests
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+models = requests.get(
+    f"{openai_api_base}/models",
+    headers={
+        "Authorization": f"Bearer {openai_api_key}"
+    },
+).json()
+model = models["data"][0]["id"]
+
+# Streaming chat completions
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+
+response = requests.post(
+    f"{openai_api_base}/chat/completions",
+    headers={"Authorization": f"Bearer {openai_api_key}"},
+    json={
+        "model": model,
+        "messages": messages,
+        "stream": True
+    },
+)
+
+print("client: Start streaming chat completions...")
+printed_reasoning_content = False
+printed_content = False
+# Make the streaming request
+if response.status_code == 200:
+    # Process the streaming response
+    for line in response.iter_lines():
+        if line:  # Filter out keep-alive new lines
+            # Decode the line and parse the JSON
+            decoded_line = line.decode("utf-8")
+            if decoded_line.startswith("data:"):
+                data = decoded_line[5:].strip()  # Remove "data:" prefix
+                if data == "[DONE]":  # End of stream
+                    print("\nclient: Stream completed.")
+                    break
+                try:
+                    # Parse the JSON data
+                    chunk = json.loads(data)
+                    reasoning_content = chunk["choices"][0]["delta"].get(
+                        "reasoning_content", "")
+                    content = chunk["choices"][0]["delta"].get("content", "")
+
+                    if reasoning_content:
+                        if not printed_reasoning_content:
+                            printed_reasoning_content = True
+                            print("reasoning_content:", end="", flush=True)
+                        print(reasoning_content, end="", flush=True)
+                    elif content:
+                        if not printed_content:
+                            printed_content = True
+                            print("\ncontent:", end="", flush=True)
+                        # Extract and print the content
+                        print(content, end="", flush=True)
+                except json.JSONDecodeError:
+                    print("Error decoding JSON:", decoded_line)
+else:
+    print(f"Error: {response.status_code} - {response.text}")
diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/entrypoints/openai/reasoning_parsers/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
new file mode 100644
index 0000000000000..4607e4dfe4d0b
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -0,0 +1,120 @@
+from typing import List
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.entrypoints.openai.reasoning_parsers.utils import (
+    run_reasoning_extraction)
+from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
+                                                       ReasoningParserManager)
+
+parser_name = "deepseek_r1"
+start_token = "<think>"
+end_token = "</think>"
+
+SIMPLE_REASONING = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+NO_REASONING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": None,
+    "content": "This is a reasoning section",
+}
+MULTIPLE_LINES = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        True,
+        NO_REASONING,
+        id="no_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING,
+        id="shortest_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest_streaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+):
+    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+    tokenizer.add_tokens([start_token, end_token])
+    output = tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: List[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py
new file mode 100644
index 0000000000000..ac73ad50a7395
--- /dev/null
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
@@ -0,0 +1,93 @@
+from typing import List, Optional, Tuple, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
+
+
+class StreamingReasoningReconstructor:
+
+    def __init__(self):
+        self.reasoning_content = None
+        self.other_content = None
+
+    def append_delta(self, delta: DeltaMessage):
+        # content and the reasoning content should not be present
+        # at the same time
+        assert delta.content is None or delta.reasoning_content is None, (
+            "Both content and reasoning content are present in the "
+            "delta message")
+        if delta.content is not None:
+            if self.other_content is None:
+                self.other_content = delta.content
+            else:
+                self.other_content += delta.content
+        else:
+            if self.reasoning_content is None:
+                self.reasoning_content = delta.reasoning_content
+            else:
+                self.reasoning_content += delta.reasoning_content
+
+
+def run_reasoning_extraction(
+    
reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, +) -> Tuple[Optional[str], Optional[str]]: + if streaming: + reconstructor = run_reasoning_extraction_streaming( + reasoning_parser, + model_output, + request, + ) + return ( + reconstructor.reasoning_content, + reconstructor.other_content or None, + ) + else: + reasoning, content = run_reasoning_extraction_nonstreaming( + reasoning_parser, model_output, request) + return reasoning, content + + +def run_reasoning_extraction_nonstreaming( + reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> Tuple[Optional[str], Optional[str]]: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return reasoning_parser.extract_reasoning_content( + model_output=''.join(model_output), request=request) + + +def run_reasoning_extraction_streaming( + reasoning_parser: ReasoningParser, + model_deltas: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> StreamingReasoningReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingReasoningReconstructor() + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + reasoning_parser.vocab.get(token) + for token in reasoning_parser.model_tokenizer.tokenize(delta) + if token in reasoning_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = reasoning_parser.extract_reasoning_content_streaming( + previous_text, + current_text, + delta, + previous_tokens, + current_tokens, + token_delta, + ) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index e49562ad6a21f..01bcd78aa91a8 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser): validate_parsed_serve_args(args) +def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): + """Ensure validation fails if reasoning is enabled with auto tool choice""" + args = serve_parser.parse_args(args=[ + "--enable-auto-tool-choice", + "--enable-reasoning", + ]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + +def test_enable_reasoning_passes_with_reasoning_parser(serve_parser): + """Ensure validation passes if reasoning is enabled + with a reasoning parser""" + args = serve_parser.parse_args(args=[ + "--enable-reasoning", + "--reasoning-parser", + "deepseek_r1", + ]) + validate_parsed_serve_args(args) + + +def test_enable_reasoning_fails_without_reasoning_parser(serve_parser): + """Ensure validation fails if reasoning is enabled + without a reasoning parser""" + args = serve_parser.parse_args(args=["--enable-reasoning"]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + def test_chat_template_validation_for_happy_paths(serve_parser): """Ensure validation passes if the chat template exists""" args = serve_parser.parse_args( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 077bc993726ae..9e5cf4ba2e490 100644 --- a/vllm/entrypoints/openai/api_server.py +++ 
b/vllm/entrypoints/openai/api_server.py @@ -61,6 +61,7 @@ TokenizeRequest, TokenizeResponse, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -771,6 +772,8 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_reasoning=args.enable_reasoning, + reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( @@ -844,6 +847,13 @@ async def run_server(args, **uvicorn_kwargs) -> None: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " f"(chose from {{ {','.join(valid_tool_parses)} }})") + valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() + if args.enable_reasoning \ + and args.reasoning_parser not in valid_reasoning_parses: + raise KeyError( + f"invalid reasoning parser: {args.reasoning_parser} " + f"(chose from {{ {','.join(valid_reasoning_parses)} }})") + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 4df75a665bab9..9cfe07c65d55e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,6 +12,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -208,6 +209,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Enable auto tool choice for supported models. Use " "``--tool-call-parser`` to specify which parser to use.") + parser.add_argument( + "--enable-reasoning", + action="store_true", + default=False, + help="Whether to enable reasoning_content for the model. " + "If enabled, the model will be able to generate reasoning content.") + + valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys() + parser.add_argument( + "--reasoning-parser", + type=str, + metavar="{" + ",".join(valid_reasoning_parsers) + "}", + default=None, + help= + "Select the reasoning parser depending on the model that you're using." + " This is used to parse the reasoning content into OpenAI API " + "format. Required for ``--enable-reasoning``.") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -267,6 +285,18 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") + # Enable reasoning needs a reasoning parser to be valid + if args.enable_reasoning and not args.reasoning_parser: + raise TypeError("Error: --enable-reasoning requires " + "--reasoning-parser") + + # Ref https://api-docs.deepseek.com/guides/reasoning_model + # tool call and reasoning cannot be enabled at the same time. 
+ if args.enable_auto_tool_choice and args.enable_reasoning: + raise TypeError( + "Error: --enable-auto-tool-choice and " + "--enable-reasoning cannot be enabled at the same time") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f89c3f42aab17..2bc136cc48038 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1202,6 +1202,7 @@ class ExtractedToolCallInformation(BaseModel): class ChatMessage(OpenAIBaseModel): role: str + reasoning_content: Optional[str] = None content: Optional[str] = None tool_calls: List[ToolCall] = Field(default_factory=list) @@ -1243,6 +1244,7 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: List[DeltaToolCall] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py new file mode 100644 index 0000000000000..a21bff52f61fa --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -0,0 +1,6 @@ +from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager +from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser + +__all__ = [ + "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser" +] diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py new file mode 100644 index 0000000000000..e5d10ee0bc3a8 --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -0,0 +1,158 @@ +import os +from functools import cached_property +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import import_from_path, is_list_of + +logger = init_logger(__name__) + + +class ReasoningParser: + """ + Abstract reasoning parser class that should not be used directly. + Provided and methods should be used in derived classes. + + It is used to extract reasoning content from the model output. + """ + + def __init__(self, tokenizer: AnyTokenizer): + self.model_tokenizer = tokenizer + + @cached_property + def vocab(self) -> Dict[str, int]: + # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab + # whereas all tokenizers have .get_vocab() + return self.model_tokenizer.get_vocab() + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. 
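+
+        For example, the DeepSeek-R1 parser registered later in this patch
+        returns ("abc", "xyz") for "<think>abc</think>xyz", and
+        (None, model_output) when the output contains no think tokens
+        (see the unit tests above).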
+ """ + + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_calls " + "has not been implemented!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_content_streaming " + "has not been implemented!") + + +class ReasoningParserManager: + reasoning_parsers: Dict[str, Type] = {} + + @classmethod + def get_reasoning_parser(cls, name) -> Type: + """ + Get reasoning parser by name which is registered by `register_module`. + + Raise a KeyError exception if the name is not registered. + """ + if name in cls.reasoning_parsers: + return cls.reasoning_parsers[name] + + raise KeyError(f"reasoning helper: '{name}' not found in " + "reasoning_parsers") + + @classmethod + def _register_module(cls, + module: Type, + module_name: Optional[Union[str, List[str]]] = None, + force: bool = True) -> None: + if not issubclass(module, ReasoningParser): + raise TypeError("module must be subclass of ReasoningParser, " + f"but got {type(module)}") + if module_name is None: + module_name = module.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in cls.reasoning_parsers: + existed_module = cls.reasoning_parsers[name] + raise KeyError(f"{name} is already registered " + f"at {existed_module.__module__}") + cls.reasoning_parsers[name] = module + + @classmethod + def register_module( + cls, + name: Optional[Union[str, List[str]]] = None, + force: bool = True, + module: Union[Type, None] = None) -> Union[type, Callable]: + """ + Register module with the given name or name list. it can be used as a + decoder(with module as None) or normal function(with module as not + None). + """ + if not isinstance(force, bool): + raise TypeError(f"force must be a boolean, but got {type(force)}") + + # raise the error ahead of time + if not (name is None or isinstance(name, str) + or is_list_of(name, str)): + raise TypeError( + "name must be None, an instance of str, or a sequence of str, " + f"but got {type(name)}") + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + cls._register_module(module=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(module): + cls._register_module(module=module, module_name=name, force=force) + return module + + return _register + + @classmethod + def import_reasoning_parser(cls, plugin_path: str) -> None: + """ + Import a user-defined reasoning parser by the path + of the reasoning parser define file. 
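+
+        A hypothetical usage sketch (the path and parser name below are
+        placeholders, assuming the plugin file registers a parser under
+        that name with ``ReasoningParserManager.register_module``):
+
+            ReasoningParserManager.import_reasoning_parser(
+                "/path/to/my_reasoning_parser.py")
+            parser_cls = ReasoningParserManager.get_reasoning_parser(
+                "my_reasoning")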
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
+            return
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
new file mode 100644
index 0000000000000..a440ddc8d3b5d
--- /dev/null
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -0,0 +1,133 @@
+import re
+from typing import Optional, Sequence, Tuple, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("deepseek_r1")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for DeepSeek R1 model.
+
+    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning
+    text. This parser extracts the reasoning content from the model output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
+
+        self.reasoning_regex = re.compile(
+            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        if (self.think_start_token_id is None
+                or self.think_end_token_id is None):
+            raise RuntimeError(
+                "DeepSeek R1 reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!")
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.think_start_token_id, self.think_end_token_id
+        ]):
+            return None
+
+        if self.think_start_token_id in previous_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token_id in delta_token_ids:
+            logger.info(delta_text)
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.think_start_token
+                                                   ):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, treat the delta as content.
+            return DeltaMessage(content=delta_text)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> Tuple[Optional[str], Optional[str]]:
+
+        # Check if the model output contains the <think> tokens.
+        if (self.think_start_token not in model_output
+                or self.think_end_token not in model_output):
+            return None, model_output
+        else:
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            # Remove the reasoning content from the model output
+            # Although deepseek's <think> token is always at the
+            # beginning of the line, we cannot guarantee that the
+            # other models will follow this convention.
+            # Therefore, we need to add :start_index.
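+            # Worked example (for illustration): with model_output
+            # "prefix<think>abc</think>xyz", start_index is 6, end_index is
+            # 6 + len("<think>abc</think>"), and the result is
+            # reasoning_content "abc" with content "prefixxyz".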
+ start_index = model_output.find(self.think_start_token) + if start_index != -1: + end_index = start_index + len( + f"{self.think_start_token}{reasoning_content}{self.think_end_token}" + ) + model_output = model_output[:start_index] + \ + model_output[end_index:] + + if len(model_output) == 0: + return reasoning_content, None + + return reasoning_content, model_output diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 89a119ac65695..dc97f0eb059d7 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,6 +21,8 @@ ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) +from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, + ReasoningParserManager) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager @@ -47,6 +49,8 @@ def __init__( chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, return_tokens_as_token_ids: bool = False, + enable_reasoning: bool = False, + reasoning_parser: Optional[str] = None, enable_auto_tools: bool = False, tool_parser: Optional[str] = None, enable_prompt_tokens_details: bool = False, @@ -69,6 +73,18 @@ def __init__( " the parallel_tool_calls client option is preset for " "compatibility reasons, it will be ignored.") + self.enable_reasoning: bool = enable_reasoning + self.reasoning_parser: Optional[Callable[[AnyTokenizer], + ReasoningParser]] = None + if self.enable_reasoning: + try: + self.reasoning_parser = ( + ReasoningParserManager.get_reasoning_parser( + reasoning_parser)) + except Exception as e: + raise TypeError("Error: --enable-reasoning requires " + f"reasoning_parser:'{reasoning_parser}' " + "which has not been registered") from e self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None if self.enable_auto_tools: try: @@ -285,14 +301,35 @@ async def chat_completion_stream_generator( not tool_choice_function_name and self._should_stream_with_auto_tool_parsing(request)) + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + all_previous_token_ids: Optional[List[List[int]]] - if tool_choice_auto: + + # Only one of these will be used, thus previous_texts and + # all_previous_token_ids will not be used twice in the same iteration. + if tool_choice_auto or should_stream_with_reasoning_parsing: # These are only required in "auto" tool choice case previous_texts = [""] * num_choices all_previous_token_ids = [[]] * num_choices else: previous_texts, all_previous_token_ids = None, None + try: + # There is no need to check if the reasoning_parser is None + # because the should_stream_with_reasoning_parsing check + # already ensures that the reasoning_parser is not None. + # but the pre-commit hook requires it. 
+ if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + return + # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: @@ -456,6 +493,32 @@ async def chat_completion_stream_generator( # update the previous values for the next iteration previous_texts[i] = current_text all_previous_token_ids[i] = current_token_ids + # reasoning_content cannot be enabled with tool_choice. + # If it is, the tool_choice will be used instead. + elif self.enable_reasoning: + # handle reasoning_content delta + assert reasoning_parser is not None + assert previous_texts is not None + assert all_previous_token_ids is not None + previous_text = previous_texts[i] + previous_token_ids = all_previous_token_ids[i] + current_text = previous_text + delta_text + current_token_ids = previous_token_ids + list( + output.token_ids) + + delta_message = (reasoning_parser. + extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output.token_ids, + )) + + # update the previous values for the next iteration + previous_texts[i] = current_text + all_previous_token_ids[i] = current_token_ids # handle streaming just a content delta else: @@ -642,17 +705,38 @@ async def chat_completion_full_generator( else: logprobs = None + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + # In the OpenAI API the finish_reason is "tools_called" # if the tool choice is auto and the model produced a tool # call. The same is not true for named function calls auto_tools_called = False + if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + try: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + return self.create_error_response(str(e)) + + reasoning_content, content = ( + reasoning_parser.extract_reasoning_content( + output.text, request=request)) + + if reasoning_content: + message = ChatMessage(role=role, + content=content, + reasoning_content=reasoning_content) + else: + message = ChatMessage(role=role, content=output.text) + # if auto tools are not enabled, and a named tool choice using # outlines is not being used - if (not self.enable_auto_tools - or not self.tool_parser) and not isinstance( - request.tool_choice, - ChatCompletionNamedToolChoiceParam): + elif (not self.enable_auto_tools + or not self.tool_parser) and not isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam): message = ChatMessage(role=role, content=output.text) # if the request uses tools and specified a tool choice @@ -835,6 +919,17 @@ def _should_stream_with_auto_tool_parsing(self, return (request.tools and self.tool_parser and self.enable_auto_tools and request.tool_choice in ['auto', None]) + def _should_stream_with_reasoning_parsing(self, + request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the + reasoning parser that was configured. + + We only want to do this IF reasoning is enabled and a reasoning + parser is configured. 
+ """ + return self.enable_reasoning and self.reasoning_parser is not None + def _should_check_for_unstreamed_tool_arg_tokens( self, delta_message: Optional[DeltaMessage], diff --git a/vllm/scripts.py b/vllm/scripts.py index 42e1c639eda10..8101e6b3af7ee 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -167,6 +167,7 @@ def main(): "Must be a YAML with the following options:" "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) + serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) From dd6a3a02cb3bf2a7bc6cb84c85dcd57c6eaf2bf9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 29 Jan 2025 03:38:29 +0000 Subject: [PATCH 27/69] [Doc] Convert docs to use colon fences (#12471) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/requirements-docs.txt | 4 +- docs/source/api/engine/index.md | 4 +- docs/source/api/model/index.md | 4 +- docs/source/api/multimodal/index.md | 4 +- docs/source/api/offline_inference/index.md | 4 +- .../contributing/dockerfile/dockerfile.md | 4 +- docs/source/contributing/model/basic.md | 8 +- docs/source/contributing/model/index.md | 12 +- docs/source/contributing/model/multimodal.md | 32 +- .../source/contributing/model/registration.md | 16 +- docs/source/contributing/model/tests.md | 8 +- docs/source/contributing/overview.md | 12 +- .../contributing/profiling/profiling_index.md | 12 +- docs/source/deployment/docker.md | 16 +- .../source/deployment/frameworks/cerebrium.md | 4 +- docs/source/deployment/frameworks/dstack.md | 8 +- docs/source/deployment/frameworks/helm.md | 408 +++--- docs/source/deployment/frameworks/index.md | 4 +- docs/source/deployment/frameworks/skypilot.md | 36 +- docs/source/deployment/integrations/index.md | 4 +- docs/source/deployment/nginx.md | 4 +- docs/source/design/arch_overview.md | 20 +- docs/source/design/kernel/paged_attention.md | 32 +- docs/source/design/multiprocessing.md | 4 +- .../features/automatic_prefix_caching.md | 4 +- docs/source/features/compatibility_matrix.md | 879 ++++++------ docs/source/features/disagg_prefill.md | 20 +- docs/source/features/lora.md | 4 +- docs/source/features/quantization/auto_awq.md | 4 +- docs/source/features/quantization/fp8.md | 16 +- docs/source/features/quantization/gguf.md | 12 +- docs/source/features/quantization/index.md | 4 +- docs/source/features/quantization/int8.md | 8 +- .../quantization/supported_hardware.md | 229 +-- docs/source/features/spec_decode.md | 8 +- docs/source/features/structured_outputs.md | 4 +- docs/source/generate_examples.py | 4 +- .../ai_accelerator/hpu-gaudi.inc.md | 76 +- .../installation/ai_accelerator/index.md | 272 ++-- .../installation/ai_accelerator/neuron.inc.md | 4 +- .../installation/ai_accelerator/tpu.inc.md | 51 +- .../installation/cpu/apple.inc.md | 4 +- .../getting_started/installation/cpu/index.md | 86 +- .../installation/cpu/x86.inc.md | 4 +- .../installation/gpu/cuda.inc.md | 12 +- .../getting_started/installation/gpu/index.md | 206 +-- .../installation/gpu/rocm.inc.md | 25 +- .../installation/gpu/xpu.inc.md | 4 +- .../getting_started/installation/index.md | 4 +- .../installation/python_env_setup.inc.md | 4 +- docs/source/getting_started/quickstart.md | 12 +- .../source/getting_started/troubleshooting.md | 12 +- docs/source/index.md | 48 +- docs/source/models/extensions/index.md | 4 +- .../models/extensions/runai_model_streamer.md | 4 +- docs/source/models/extensions/tensorizer.md | 4 +- 
docs/source/models/generative_models.md | 4 +- docs/source/models/pooling_models.md | 60 +- docs/source/models/supported_models.md | 1263 +++++++++-------- docs/source/serving/distributed_serving.md | 12 +- docs/source/serving/engine_args.md | 2 + docs/source/serving/env_vars.md | 8 +- docs/source/serving/integrations/index.md | 4 +- docs/source/serving/metrics.md | 4 +- docs/source/serving/multimodal_inputs.md | 53 +- docs/source/serving/offline_inference.md | 8 +- .../serving/openai_compatible_server.md | 56 +- pyproject.toml | 1 + 68 files changed, 2091 insertions(+), 2080 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 8217bc3ba3ded..1d669699f4b2a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,10 +1,10 @@ sphinx==6.2.1 +sphinx-argparse==0.4.0 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==3.0.1 -sphinx-argparse==0.4.0 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 +myst-parser==3.0.1 msgspec cloudpickle diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md index 701cb95d3be33..b6544d94afdf8 100644 --- a/docs/source/api/engine/index.md +++ b/docs/source/api/engine/index.md @@ -8,10 +8,10 @@ .. currentmodule:: vllm.engine ``` -```{toctree} +:::{toctree} :caption: Engines :maxdepth: 2 llm_engine async_llm_engine -``` +::: diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md index 113792147be7c..8fee3a55c93de 100644 --- a/docs/source/api/model/index.md +++ b/docs/source/api/model/index.md @@ -2,10 +2,10 @@ ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 interfaces_base interfaces adapters -``` +::: diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 14efdb506d76f..069ed53e545c5 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -17,7 +17,7 @@ Looking to add your own multi-modal model? Please follow the instructions listed ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 inputs @@ -25,4 +25,4 @@ parse processing profiling registry -``` +::: diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md index c32f99d59e3db..ec2cc599d923c 100644 --- a/docs/source/api/offline_inference/index.md +++ b/docs/source/api/offline_inference/index.md @@ -1,9 +1,9 @@ # Offline Inference -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 llm llm_inputs -``` +::: diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index cb142318b8724..96674805df534 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -17,11 +17,11 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} /assets/contributing/dockerfile-stages-dependency.png + > :::{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% - > ``` + > ::: > > Made using: > diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index b9b92fd027f6e..180fdd59e9a64 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -10,9 +10,9 @@ First, clone the PyTorch model code from the source repository. 
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. -```{warning} +:::{warning} Make sure to review and adhere to the original code's copyright and licensing terms! -``` +::: ## 2. Make your code compatible with vLLM @@ -80,10 +80,10 @@ def forward( ... ``` -```{note} +:::{note} Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` +::: For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index fe018b61b08cf..721ee3cd2047c 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -4,7 +4,7 @@ This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -12,16 +12,16 @@ basic registration tests multimodal -``` +::: -```{note} +:::{note} The complexity of adding a new model depends heavily on the model's architecture. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` +::: -```{tip} +:::{tip} If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) or ask on our [developer slack](https://slack.vllm.ai). We will be happy to help you out! -``` +::: diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5fd9a2877ceb..6c6f3b701cd28 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -48,9 +48,9 @@ Further update the model as follows: return vision_embeddings ``` - ```{important} + :::{important} The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - ``` + ::: - Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. @@ -89,10 +89,10 @@ Further update the model as follows: + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` - ```{note} + :::{note} The model class does not have to be named {code}`*ForCausalLM`. Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ``` + ::: ## 2. 
Specify processing information @@ -120,8 +120,8 @@ When calling the model, the output embeddings from the visual encoder are assign containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal to the size of the output embeddings. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the code of HF's `LlavaForConditionalGeneration`: @@ -254,12 +254,12 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. -``` - ::: + :::: +::::: ## 3. Specify dummy inputs @@ -315,17 +315,17 @@ def get_dummy_processor_inputs( Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to fill in the missing details about HF processing. -```{seealso} +:::{seealso} [Multi-Modal Data Processing](#mm-processing) -``` +::: ### Multi-modal fields Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the model's `forward` method: @@ -367,13 +367,13 @@ def _get_mm_fields_config( ) ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. -``` - ::: + :::: +::::: ### Prompt replacements diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index d6c9e4181dfee..64cd25b53807e 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -17,17 +17,17 @@ After you have implemented your model (see [tutorial](#new-model-basic)), put it Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. Finally, update our [list of supported models](#supported-models) to promote your model! -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: ## Out-of-tree models You can load an external model using a plugin without modifying the vLLM codebase. -```{seealso} +:::{seealso} [vLLM's Plugin System](#plugin-system) -``` +::: To register the model, use the following code: @@ -45,11 +45,11 @@ from vllm import ModelRegistry ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") ``` -```{important} +:::{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that [here](#supports-multimodal). -``` +::: -```{note} +:::{note} Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. 
-``` +::: diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md index 74c933b2f45da..68d51d89f7cff 100644 --- a/docs/source/contributing/model/tests.md +++ b/docs/source/contributing/model/tests.md @@ -14,14 +14,14 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: -```{tip} +:::{tip} If your model requires a development version of HF Transformers, you can set `min_transformers_version` to skip the test in CI until the model is released. -``` +::: ## Optional Tests diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 36cf8e7440eca..908c7cb4d38ee 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -35,17 +35,17 @@ pre-commit run --all-files pytest tests/ ``` -```{note} +:::{note} Currently, the repository is not fully checked by `mypy`. -``` +::: ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -```{important} +:::{important} If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). -``` +::: ## Pull Requests & Code Reviews @@ -81,9 +81,9 @@ appropriately to indicate the type of change. Please use one of the following: - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -```{note} +:::{note} If the PR spans more than one category, please include all relevant prefixes. -``` +::: ### Code Quality diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 001db86bdf555..79aeb292a9b73 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -6,21 +6,21 @@ The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` en When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. -```{warning} +:::{warning} Only enable profiling in a development environment. -``` +::: Traces can be visualized using . -```{tip} +:::{tip} Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -``` +::: -```{tip} +:::{tip} To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. 
`export VLLM_RPC_TIMEOUT=1800000` -``` +::: ## Example commands and usage diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 438be47316f3b..334c02225bd6b 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -21,11 +21,11 @@ $ docker run --runtime nvidia --gpus all \ You can add any other you need after the image tag (`vllm/vllm-openai:latest`). -```{note} +:::{note} You can either use the `ipc=host` flag or `--shm-size` flag to allow the container to access the host's shared memory. vLLM uses PyTorch, which uses shared memory to share data between processes under the hood, particularly for tensor parallel inference. -``` +::: (deployment-docker-build-image-from-source)= @@ -38,25 +38,25 @@ You can build and run vLLM from source via the provided . To DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` -```{note} +:::{note} By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. If you are using Podman instead of Docker, you might need to disable SELinux labeling by adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). -``` +::: ## Building for Arm64/aarch64 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. -```{note} +:::{note} Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -``` +::: ```console # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) @@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \ The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). -```{note} +:::{note} **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . -``` +::: diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index 5787c4a407bfb..b20c95137b6e7 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -2,11 +2,11 @@ # Cerebrium -```{raw} html +:::{raw} html

vLLM_plus_cerebrium

-``` +::: vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index b42a34125c6d7..a16e28f2d8983 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -2,11 +2,11 @@ # dstack -```{raw} html +:::{raw} html

vLLM_plus_dstack

-``` +::: vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. @@ -97,6 +97,6 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{note} +:::{note} dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) -``` +::: diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md index 18ed293191468..e4fc5e1313079 100644 --- a/docs/source/deployment/frameworks/helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release. ## Architecture -```{image} /assets/deployment/architecture_helm_deployment.png -``` +:::{image} /assets/deployment/architecture_helm_deployment.png +::: ## Values -```{list-table} +:::{list-table} :widths: 25 25 25 25 :header-rows: 1 -* - Key - - Type - - Default - - Description -* - autoscaling - - object - - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - - Autoscaling configuration -* - autoscaling.enabled - - bool - - false - - Enable autoscaling -* - autoscaling.maxReplicas - - int - - 100 - - Maximum replicas -* - autoscaling.minReplicas - - int - - 1 - - Minimum replicas -* - autoscaling.targetCPUUtilizationPercentage - - int - - 80 - - Target CPU utilization for autoscaling -* - configs - - object - - {} - - Configmap -* - containerPort - - int - - 8000 - - Container port -* - customObjects - - list - - [] - - Custom Objects configuration -* - deploymentStrategy - - object - - {} - - Deployment strategy configuration -* - externalConfigs - - list - - [] - - External configuration -* - extraContainers - - list - - [] - - Additional containers configuration -* - extraInit - - object - - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - - Additional configuration for the init container -* - extraInit.pvcStorage - - string - - "50Gi" - - Storage size of the s3 -* - extraInit.s3modelpath - - string - - "relative_s3_model_path/opt-125m" - - Path of the model on the s3 which hosts model weights and config files -* - extraInit.awsEc2MetadataDisabled - - boolean - - true - - Disables the use of the Amazon EC2 instance metadata service -* - extraPorts - - list - - [] - - Additional ports configuration -* - gpuModels - - list - - ["TYPE_GPU_USED"] - - Type of gpu used -* - image - - object - - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - - Image configuration -* - image.command - - list - - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - - Container launch command -* - image.repository - - string - - "vllm/vllm-openai" - - Image repository -* - image.tag - - string - - "latest" - - Image tag -* - livenessProbe - - object - - 
{"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - - Liveness probe configuration -* - livenessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -* - livenessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - livenessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - livenessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - livenessProbe.initialDelaySeconds - - int - - 15 - - Number of seconds after the container has started before liveness probe is initiated -* - livenessProbe.periodSeconds - - int - - 10 - - How often (in seconds) to perform the liveness probe -* - maxUnavailablePodDisruptionBudget - - string - - "" - - Disruption Budget Configuration -* - readinessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - - Readiness probe configuration -* - readinessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -* - readinessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - readinessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - readinessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - readinessProbe.initialDelaySeconds - - int - - 5 - - Number of seconds after the container has started before readiness probe is initiated -* - readinessProbe.periodSeconds - - int - - 5 - - How often (in seconds) to perform the readiness probe -* - replicaCount - - int - - 1 - - Number of replicas -* - resources - - object - - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - - Resource configuration -* - resources.limits."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.limits.cpu - - int - - 4 - - Number of CPUs -* - resources.limits.memory - - string - - "16Gi" - - CPU memory configuration -* - resources.requests."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.requests.cpu - - int - - 4 - - Number of CPUs -* - resources.requests.memory - - string - - "16Gi" - - CPU memory configuration -* - secrets - - object - - {} - - Secrets configuration -* - serviceName - - string - - - - Service name -* - servicePort - - int - - 80 - - Service port -* - labels.environment - - string - - test - - Environment name -* - labels.release - - string - - test - - Release name -``` +- * Key + * Type + * Default + * Description +- * autoscaling + * object + * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} + * Autoscaling configuration +- * autoscaling.enabled + * bool + * false + * Enable autoscaling +- * autoscaling.maxReplicas + * int + * 100 + * Maximum replicas +- * autoscaling.minReplicas + * int + * 1 + * Minimum replicas +- * autoscaling.targetCPUUtilizationPercentage + * int + * 80 + * Target CPU utilization for autoscaling +- * configs + * object + * {} + * Configmap +- * containerPort + * int + * 
8000 + * Container port +- * customObjects + * list + * [] + * Custom Objects configuration +- * deploymentStrategy + * object + * {} + * Deployment strategy configuration +- * externalConfigs + * list + * [] + * External configuration +- * extraContainers + * list + * [] + * Additional containers configuration +- * extraInit + * object + * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} + * Additional configuration for the init container +- * extraInit.pvcStorage + * string + * "50Gi" + * Storage size of the s3 +- * extraInit.s3modelpath + * string + * "relative_s3_model_path/opt-125m" + * Path of the model on the s3 which hosts model weights and config files +- * extraInit.awsEc2MetadataDisabled + * boolean + * true + * Disables the use of the Amazon EC2 instance metadata service +- * extraPorts + * list + * [] + * Additional ports configuration +- * gpuModels + * list + * ["TYPE_GPU_USED"] + * Type of gpu used +- * image + * object + * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} + * Image configuration +- * image.command + * list + * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] + * Container launch command +- * image.repository + * string + * "vllm/vllm-openai" + * Image repository +- * image.tag + * string + * "latest" + * Image tag +- * livenessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} + * Liveness probe configuration +- * livenessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive +- * livenessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * livenessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * livenessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * livenessProbe.initialDelaySeconds + * int + * 15 + * Number of seconds after the container has started before liveness probe is initiated +- * livenessProbe.periodSeconds + * int + * 10 + * How often (in seconds) to perform the liveness probe +- * maxUnavailablePodDisruptionBudget + * string + * "" + * Disruption Budget Configuration +- * readinessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} + * Readiness probe configuration +- * readinessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready +- * readinessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * readinessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * readinessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * readinessProbe.initialDelaySeconds + * int + * 5 + * Number of seconds after the container has started before readiness probe is initiated +- * readinessProbe.periodSeconds + * int + * 5 + * How often (in seconds) to perform the readiness probe +- * replicaCount + 
* int + * 1 + * Number of replicas +- * resources + * object + * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} + * Resource configuration +- * resources.limits."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.limits.cpu + * int + * 4 + * Number of CPUs +- * resources.limits.memory + * string + * "16Gi" + * CPU memory configuration +- * resources.requests."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.requests.cpu + * int + * 4 + * Number of CPUs +- * resources.requests.memory + * string + * "16Gi" + * CPU memory configuration +- * secrets + * object + * {} + * Secrets configuration +- * serviceName + * string + * + * Service name +- * servicePort + * int + * 80 + * Service port +- * labels.environment + * string + * test + * Environment name +- * labels.release + * string + * test + * Release name +::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index 964782763f6b3..cb758d3e6d2e4 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -1,6 +1,6 @@ # Using other frameworks -```{toctree} +:::{toctree} :maxdepth: 1 bentoml @@ -11,4 +11,4 @@ lws modal skypilot triton -``` +::: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 051fc2f2a8d4e..5e101b9001033 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -2,11 +2,11 @@ # SkyPilot -```{raw} html +:::{raw} html
vLLM
-``` +::: vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). @@ -104,10 +104,10 @@ service: max_completion_tokens: 1 ``` -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -153,9 +153,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: Start the serving the Llama-3 8B model on multiple replicas: @@ -169,10 +169,10 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -```{raw} html +:::{raw} html
Example outputs: -``` +::: ```console Services @@ -185,9 +185,9 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -```{raw} html +:::{raw} html
-``` +::: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: @@ -223,10 +223,10 @@ service: This will scale the service up to when the QPS exceeds 2 for each replica. -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -275,9 +275,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: To update the service with the new config: @@ -295,10 +295,10 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. -```{raw} html +:::{raw} html
Click to see the full GUI YAML -``` +::: ```yaml envs: @@ -328,9 +328,9 @@ run: | --stop-token-ids 128009,128001 | tee ~/gradio.log ``` -```{raw} html +:::{raw} html
-``` +::: 1. Start the chat web UI: diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md index d47ede8967547..c286edb4d7bc1 100644 --- a/docs/source/deployment/integrations/index.md +++ b/docs/source/deployment/integrations/index.md @@ -1,9 +1,9 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 kserve kubeai llamastack -``` +::: diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md index a58f791c2997b..87feb48856853 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/source/deployment/nginx.md @@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf ``` -```{note} +:::{note} If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -``` +::: (nginxloadbalancer-nginx-launch-nginx)= diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index cec503ef2f77d..04886e5981eef 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -4,19 +4,19 @@ This document provides an overview of the vLLM architecture. -```{contents} Table of Contents +:::{contents} Table of Contents :depth: 2 :local: true -``` +::: ## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png :alt: Entrypoints Diagram -``` +::: ### LLM Class @@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png :alt: LLMEngine Diagram -``` +::: ### LLMEngine @@ -144,11 +144,11 @@ configurations affect the class we ultimately get. The following figure shows the class hierarchy of vLLM: -> ```{figure} /assets/design/hierarchy.png +> :::{figure} /assets/design/hierarchy.png > :align: center > :alt: query > :width: 100% -> ``` +> ::: There are several important design choices behind this class hierarchy: @@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -````{note} +:::{note} To support this change, all vLLM models' signatures have been updated to: ```python @@ -215,7 +215,7 @@ else: ``` This way, the model can work with both old and new versions of vLLM. -```` +::: 3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. 
For example, tensor parallelism needs to shard the diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index f896f903c78f5..5f2582877260a 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -139,26 +139,26 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` - ```{figure} ../../assets/kernel/query.png + :::{figure} ../../assets/kernel/query.png :align: center :alt: query :width: 70% Query data of one token at one head - ``` + ::: - Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. - ```{figure} ../../assets/kernel/q_vecs.png + :::{figure} ../../assets/kernel/q_vecs.png :align: center :alt: q_vecs :width: 70% `q_vecs` for one thread group - ``` + ::: ```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -195,13 +195,13 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token. - ```{figure} ../../assets/kernel/key.png + :::{figure} ../../assets/kernel/key.png :align: center :alt: key :width: 70% Key data of all context tokens at one head - ``` + ::: - The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -214,13 +214,13 @@ elements for one token) that will be processed by 2 threads (one thread group) separately. - ```{figure} ../../assets/kernel/k_vecs.png + :::{figure} ../../assets/kernel/k_vecs.png :align: center :alt: k_vecs :width: 70% `k_vecs` for one thread - ``` + ::: ```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -289,14 +289,14 @@ should be performed across the entire thread block, encompassing results between the query token and all context key tokens. - ```{math} + :::{math} :nowrap: true \begin{gather*} m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \end{gather*} - ``` + ::: ### `qk_max` and `logits` @@ -379,29 +379,29 @@ ## Value -```{figure} ../../assets/kernel/value.png +:::{figure} ../../assets/kernel/value.png :align: center :alt: value :width: 70% Value data of all context tokens at one head -``` +::: -```{figure} ../../assets/kernel/logits_vec.png +:::{figure} ../../assets/kernel/logits_vec.png :align: center :alt: logits_vec :width: 50% `logits_vec` for one thread -``` +::: -```{figure} ../../assets/kernel/v_vec.png +:::{figure} ../../assets/kernel/v_vec.png :align: center :alt: v_vec :width: 70% List of `v_vec` for one thread -``` +::: - Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index c2cdb75ea08a7..55dae0bb92d4e 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -7,9 +7,9 @@ page for information on known issues and how to solve them. ## Introduction -```{important} +:::{important} The source code references are to the state of the code at the time of writing in December, 2024. 
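A quick way to sanity-check the softmax reduction defined in the paged attention walkthrough above (the m(x), f(x) and l(x) terms) is to write it out in NumPy. This is only an illustration of the math; the kernel performs the same reduction cooperatively across the thread block rather than calling NumPy.

```python
import numpy as np

def stable_softmax(x: np.ndarray) -> np.ndarray:
    """Max-subtracted softmax, matching the m(x), f(x), l(x) definitions above."""
    m = np.max(x)          # m(x): maximum over the qk results
    f = np.exp(x - m)      # f(x): exponentials shifted by the max to avoid overflow
    l = np.sum(f)          # l(x): normalization constant
    return f / l           # softmax(x) = f(x) / l(x)

qk = np.array([3.2, 11.5, 0.7, 9.9], dtype=np.float32)
probs = stable_softmax(qk)
assert np.isclose(probs.sum(), 1.0)
```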
-``` +::: The use of Python multiprocessing in vLLM is complicated by: diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md index 3d70cbb29c385..59016d7fcf6b3 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -6,9 +6,9 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. -```{note} +:::{note} Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). -``` +::: ## Enabling APC in vLLM diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 47ab616b30686..b0018ebccf5ba 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -4,13 +4,13 @@ The tables below show mutually exclusive features and the support on some hardware. -```{note} +:::{note} Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. -``` +::: ## Feature x Feature -```{raw} html +:::{raw} html -``` +::: -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - [CP](#chunked-prefill) - - [APC](#automatic-prefix-caching) - - [LoRA](#lora-adapter) - - prmpt adptr - - [SD](#spec_decode) - - CUDA graph - - pooling - - enc-dec - - logP - - prmpt logP - - async output - - multi-step - - mm - - best-of - - beam-search - - guided dec - * - [CP](#chunked-prefill) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [APC](#automatic-prefix-caching) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [LoRA](#lora-adapter) - - [✗](gh-pr:9057) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - * - pooling - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - - - - - - - - - - - - - - - - - - - - * - enc-dec - - ✗ - - [✗](gh-issue:7366) - - ✗ - - ✗ - - [✗](gh-issue:7366) - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-pr:8199) - - ✅ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - * - multi-step - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - [✗](gh-issue:8198) - - ✅ - - - - - - - - - - - * - mm - - ✅ - - [✗](gh-pr:8348) - - [✗](gh-pr:7199) - - ? - - ? - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? - - - - - - - - - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968) - - ✅ - - - - - - - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968>) - - ? - - ✅ - - - - - * - guided dec - - ✅ - - ✅ - - ? - - ? - - [✗](gh-issue:11484) - - ✅ - - ✗ - - ? - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:9893) - - ? 
- - ✅ - - ✅ - - - -``` +- * Feature + * [CP](#chunked-prefill) + * [APC](#automatic-prefix-caching) + * [LoRA](#lora-adapter) + * prmpt adptr + * [SD](#spec_decode) + * CUDA graph + * pooling + * enc-dec + * logP + * prmpt logP + * async output + * multi-step + * mm + * best-of + * beam-search + * guided dec +- * [CP](#chunked-prefill) + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [APC](#automatic-prefix-caching) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [LoRA](#lora-adapter) + * [✗](gh-pr:9057) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * +- * pooling + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * + * + * + * + * + * + * + * + * + * +- * enc-dec + * ✗ + * [✗](gh-issue:7366) + * ✗ + * ✗ + * [✗](gh-issue:7366) + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-pr:8199) + * ✅ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * + * +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * +- * multi-step + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * [✗](gh-issue:8198) + * ✅ + * + * + * + * + * +- * mm + * ✅ + * [✗](gh-pr:8348) + * [✗](gh-pr:7199) + * ? + * ? + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? + * + * + * + * +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968) + * ✅ + * + * + * +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968>) + * ? + * ✅ + * + * +- * guided dec + * ✅ + * ✅ + * ? + * ? + * [✗](gh-issue:11484) + * ✅ + * ✗ + * ? + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:9893) + * ? + * ✅ + * ✅ + * +::: (feature-x-hardware)= ## Feature x Hardware -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - Volta - - Turing - - Ampere - - Ada - - Hopper - - CPU - - AMD - * - [CP](#chunked-prefill) - - [✗](gh-issue:2729) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [APC](#automatic-prefix-caching) - - [✗](gh-issue:3687) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [LoRA](#lora-adapter) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8475) - - ✅ - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - * - pooling - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- * - enc-dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - * - mm - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✗ - * - multi-step - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8477) - - ✅ - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - guided dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ -``` +- * Feature + * Volta + * Turing + * Ampere + * Ada + * Hopper + * CPU + * AMD +- * [CP](#chunked-prefill) + * [✗](gh-issue:2729) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [APC](#automatic-prefix-caching) + * [✗](gh-issue:3687) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [LoRA](#lora-adapter) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8475) + * ✅ +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ +- * pooling + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? +- * enc-dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ +- * mm + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✗ +- * multi-step + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8477) + * ✅ +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * guided dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +::: diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index efa2efc66192e..52d253b9c2b18 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -4,9 +4,9 @@ This page introduces you the disaggregated prefilling feature in vLLM. -```{note} +:::{note} This feature is experimental and subject to change. -``` +::: ## Why disaggregated prefilling? @@ -15,9 +15,9 @@ Two main reasons: - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. -```{note} +:::{note} Disaggregated prefill DOES NOT improve throughput. -``` +::: ## Usage example @@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling: - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. 
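To make the abstractions above concrete, here is a small, self-contained Python sketch of the described semantics: an `insert` that returns immediately, a `drop_select` that blocks until a matching entry can be removed, and a one-way pipe with `send_tensor`/`recv_tensor`. The class names and the use of plain Python objects instead of tensors are illustrative assumptions; this is not vLLM's connector implementation. The note immediately below on blocking vs. non-blocking behaviour is exactly the property this sketch tries to mimic.

```python
import queue
import threading
from typing import Any, List, Tuple


class ToyPipe:
    """Single-direction FIFO pipe: one side sends, the other receives."""

    def __init__(self) -> None:
        self._q: "queue.Queue[Any]" = queue.Queue()

    def send_tensor(self, tensor: Any) -> None:
        self._q.put(tensor)

    def recv_tensor(self) -> Any:
        return self._q.get()  # blocks until something arrives


class ToyLookupBuffer:
    """KV-cache buffer with SQL-like insert / drop_select semantics."""

    def __init__(self) -> None:
        self._entries: List[Tuple[Any, Any]] = []  # (key, kv_cache) pairs
        self._cv = threading.Condition()

    def insert(self, key: Any, kv_cache: Any) -> None:
        # Non-blocking: enqueue the entry on a helper thread and return.
        def _do_insert() -> None:
            with self._cv:
                self._entries.append((key, kv_cache))
                self._cv.notify_all()

        threading.Thread(target=_do_insert, daemon=True).start()

    def drop_select(self, key: Any) -> Any:
        # Blocking: wait until a matching entry exists, then remove and return it.
        with self._cv:
            while True:
                for i, (k, v) in enumerate(self._entries):
                    if k == key:
                        self._entries.pop(i)
                        return v
                self._cv.wait()


buf = ToyLookupBuffer()
buf.insert("req-0", {"k": "...", "v": "..."})  # prefill side inserts KV cache
print(buf.drop_select("req-0"))                # decode side pops the matching entry
```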
-```{note} +:::{note} `insert` is non-blocking operation but `drop_select` is blocking operation. -``` +::: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/features/disagg_prefill/abstraction.jpg +:::{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions -``` +::: The workflow of disaggregated prefilling is as follows: -```{image} /assets/features/disagg_prefill/overview.jpg +:::{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow -``` +::: The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index b00d05147bb32..fb5a7a0d519cb 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -```{note} +:::{note} The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. -``` +::: The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 404505eb3890e..30735b1161ff3 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,11 +2,11 @@ # AutoAWQ -```{warning} +:::{warning} Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -``` +::: To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index 1398e8a324201..a62e0124b7706 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. -```{note} +:::{note} FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
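To get a feel for the two FP8 representations described above, the snippet below prints their dynamic ranges and round-trips a few values. It assumes a recent PyTorch build that exposes `torch.float8_e4m3fn` and `torch.float8_e5m2` (available since around PyTorch 2.1) and only illustrates the number formats, not vLLM's FP8 kernels.

```python
import torch

# E4M3 trades range for precision (max ~448); E5M2 trades precision for range (max ~57344).
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}")

x = torch.tensor([0.0123, 1.0, 300.0], dtype=torch.float32)
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    # Down-cast and up-cast to see how much precision each format keeps.
    print(dtype, x.to(dtype).to(torch.float32))
```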
-``` +::: ## Quick Start with Online Dynamic Quantization @@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8") result = model.generate("Hello, my name is") ``` -```{warning} +:::{warning} Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -``` +::: ## Installation @@ -110,9 +110,9 @@ model.generate("Hello my name is") Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -``` +::: ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic @@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th ## Deprecated Flow -```{note} +:::{note} The following information is preserved for reference and search purposes. The quantization method described below is deprecated in favor of the `llmcompressor` method described above. -``` +::: For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 640997cf4bc39..65c181900f9be 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -2,13 +2,13 @@ # GGUF -```{warning} +:::{warning} Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -``` +::: -```{warning} +:::{warning} Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. -``` +::: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: @@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` -```{warning} +:::{warning} We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. -``` +::: You can also use the GGUF model directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 56ccdb5f00c34..d972dc85fc23c 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -4,7 +4,7 @@ Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
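Following on from the GGUF page above, which mentions using a GGUF checkpoint directly through the `LLM` entrypoint, a minimal sketch could look like this. The local `.gguf` path and the TinyLlama tokenizer are borrowed from the CLI examples on that page and are placeholders for your own files.

```python
from vllm import LLM, SamplingParams

# Load a local single-file GGUF checkpoint and reuse the base model's tokenizer,
# as the GGUF docs recommend, instead of converting the GGUF tokenizer.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```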
-```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -15,4 +15,4 @@ gguf int8 fp8 quantized_kvcache -``` +::: diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 592a60d3988b2..fedb16f4350e5 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). -```{note} +:::{note} INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). -``` +::: ## Prerequisites @@ -119,9 +119,9 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -``` +::: ## Best Practices diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index f5c0a95ea426e..555ed4ce4c8db 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -4,128 +4,129 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -```{list-table} +:::{list-table} :header-rows: 1 :widths: 20 8 8 8 8 8 8 8 8 8 8 -* - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU -* - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ -* - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ -* - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ -* - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ -* - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ -``` +- * Implementation + * Volta + * Turing + * Ampere + * Ada + * Hopper + * AMD GPU + * Intel GPU + * x86 CPU + * AWS Inferentia + * Google TPU +- * AWQ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✅︎ + * ✗ + * ✗ +- * GPTQ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✅︎ + * ✗ + * ✗ +- * Marlin (GPTQ/AWQ/FP8) + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * INT8 (W8A8) + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✅︎ + * ✗ + * ✗ +- * FP8 (W8A8) + * ✗ + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ +- * AQLM + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * bitsandbytes + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * DeepSpeedFP + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * GGUF + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + +::: - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. 
- "✗" indicates that the quantization method is not supported on the specified hardware. -```{note} +:::{note} This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. -``` +::: diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index ab7b2f302bd13..da87127057dc5 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -2,15 +2,15 @@ # Speculative Decoding -```{warning} +:::{warning} Please note that speculative decoding in vLLM is not yet optimized and does not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work to optimize it is ongoing and can be followed here: -``` +::: -```{warning} +:::{warning} Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -``` +::: This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 1d77c7339a33f..90c880e8cfa46 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -95,10 +95,10 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{tip} +:::{tip} While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. This can improve the results notably in most cases. -``` +::: Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aaa13d0fb6d3f..ac592e22328da 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -57,9 +57,9 @@ class Index: def generate(self) -> str: content = f"# {self.title}\n\n{self.description}\n\n" - content += "```{toctree}\n" + content += ":::{toctree}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(self.documents) + "\n```\n" + content += "\n".join(self.documents) + "\n:::\n" return content diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index ae42dd0c0d08f..704a16233981f 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -86,9 +86,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env . 
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` -```{tip} +:::{tip} If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -``` +::: ## Extra information @@ -155,30 +155,30 @@ Gaudi2 devices. Configurations that are not listed may or may not work. Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -```{list-table} vLLM execution modes +:::{list-table} vLLM execution modes :widths: 25 25 50 :header-rows: 1 -* - `PT_HPU_LAZY_MODE` - - `enforce_eager` - - execution mode -* - 0 - - 0 - - torch.compile -* - 0 - - 1 - - PyTorch eager mode -* - 1 - - 0 - - HPU Graphs -* - 1 - - 1 - - PyTorch lazy mode -``` - -```{warning} +- * `PT_HPU_LAZY_MODE` + * `enforce_eager` + * execution mode +- * 0 + * 0 + * torch.compile +- * 0 + * 1 + * PyTorch eager mode +- * 1 + * 0 + * HPU Graphs +- * 1 + * 1 + * PyTorch lazy mode +::: + +:::{warning} In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. -``` +::: (gaudi-bucketing-mechanism)= @@ -187,9 +187,9 @@ In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. -```{note} +:::{note} Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -``` +::: Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: @@ -222,15 +222,15 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. 
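As a rough sketch of what the min/step/max bucketing configuration above implies for incoming requests, the helper below pads a value up to the smallest configured bucket that fits it. The concrete bucket lists are illustrative assumptions, the real ones are derived from the logged `min`/`step`/`max` values; the padding behaviour itself is worked through in the prose that follows.

```python
from bisect import bisect_left
from typing import Sequence


def pad_to_bucket(value: int, buckets: Sequence[int]) -> int:
    """Return the smallest configured bucket that can hold `value`."""
    idx = bisect_left(buckets, value)
    if idx == len(buckets):
        # Larger than the biggest bucket: processed without padding,
        # which may trigger a graph recompilation.
        return value
    return buckets[idx]


# Illustrative bucket lists only; not taken from a real configuration.
batch_size_buckets = [1, 2, 4, 8, 16, 32, 64]
seq_len_buckets = [128, 256, 384, 512]

# A batch of 3 sequences whose longest prompt is 412 tokens:
print(pad_to_bucket(3, batch_size_buckets))    # -> 4
print(pad_to_bucket(412, seq_len_buckets))     # -> 512
```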
Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. -```{warning} +:::{warning} If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. -``` +::: As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. -```{note} +:::{note} Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -``` +::: ### Warmup @@ -252,9 +252,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. -```{tip} +:::{tip} Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -``` +::: ### HPU Graph capture @@ -269,9 +269,9 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. -```{note} +:::{note} `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. -``` +::: User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. 
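The memory accounting described above (`gpu_memory_utilization`, then `VLLM_GRAPH_RESERVED_MEM`, then `VLLM_GRAPH_PROMPT_RATIO`) is easiest to follow as plain arithmetic. The sketch below reuses the 100 GiB / 50 GiB figures from the note and the 0.2 prompt ratio from the example; it is an illustration of the bookkeeping only, not code taken from vLLM.

```python
free_after_load_gib = 50.0      # free device memory after loading weights and the profiling run
gpu_memory_utilization = 0.9    # default: 90% of that free memory is treated as usable
graph_reserved_mem = 0.1        # VLLM_GRAPH_RESERVED_MEM: share of usable memory kept for HPU Graphs
graph_prompt_ratio = 0.2        # VLLM_GRAPH_PROMPT_RATIO: share of graph memory for prefill graphs

usable_gib = free_after_load_gib * gpu_memory_utilization   # 45.0 GiB usable, 5 GiB margin kept
graph_gib = usable_gib * graph_reserved_mem                  # 4.5 GiB reserved for graph capture
remaining_gib = usable_gib - graph_gib                       # 40.5 GiB left in the usable pool
prefill_graph_gib = graph_gib * graph_prompt_ratio           # 0.9 GiB for prefill graphs
decode_graph_gib = graph_gib - prefill_graph_gib             # 3.6 GiB for decode graphs

print(usable_gib, graph_gib, remaining_gib, prefill_graph_gib, decode_graph_gib)
```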
There are two strategies implemented: \- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode @@ -279,9 +279,9 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. -```{note} +:::{note} `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. -``` +::: Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): @@ -352,13 +352,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - - `{phase}` is either `PROMPT` or `DECODE` + * `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` + * `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` + * `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: + * Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md index a6c4c44305a4c..88352f639567b 100644 --- a/docs/source/getting_started/installation/ai_accelerator/index.md +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -2,374 +2,374 @@ vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Configure a new environment -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} OpenVINO -:sync: openvino +:::: -```{include} ../python_env_setup.inc.md -``` +::::{tab-item} OpenVINO +:sync: openvino +:::{include} ../python_env_setup.inc.md ::: :::: +::::: + ## Set up using Python ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + 
+::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: :::: +::::: + ## Extra information -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} 
Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Extra information" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 575a9f9c2e2f0..145cc9d668efd 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -67,9 +67,9 @@ Currently, there are no pre-built Neuron wheels. ### Build wheel from source -```{note} +:::{note} The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -``` +::: Following instructions are applicable to Neuron SDK 2.16 and beyond. diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 6a911cc6b9eba..6827afc805fd8 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -47,10 +47,10 @@ When you request queued resources, the request is added to a queue maintained by the Cloud TPU service. When the requested resource becomes available, it's assigned to your Google Cloud project for your immediate exclusive use. -```{note} +:::{note} In all of the following commands, replace the ALL CAPS parameter names with appropriate values. See the parameter descriptions table for more information. -``` +::: ### Provision Cloud TPUs with GKE @@ -75,33 +75,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -```{list-table} Parameter descriptions +:::{list-table} Parameter descriptions :header-rows: 1 -* - Parameter name - - Description -* - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. -* - TPU_NAME - - The user-assigned name of the TPU which is created when the queued +- * Parameter name + * Description +- * QUEUED_RESOURCE_ID + * The user-assigned ID of the queued resource request. +- * TPU_NAME + * The user-assigned name of the TPU which is created when the queued resource request is allocated. -* - PROJECT_ID - - Your Google Cloud project -* - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use +- * PROJECT_ID + * Your Google Cloud project +- * ZONE + * The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see `TPU regions and zones `_ -* - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example +- * ACCELERATOR_TYPE + * The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, see `TPU versions `_. -* - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. -* - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM +- * RUNTIME_VERSION + * The TPU VM runtime version to use. 
For more information see `TPU VM images `_. +- * SERVICE_ACCOUNT + * The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` -``` +::: Connect to your TPU using SSH: @@ -178,15 +178,15 @@ Run the Docker image with the following command: docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` -```{note} +:::{note} Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each shape. The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). -``` +::: -````{tip} +:::{tip} If you encounter the following error: ```console @@ -198,9 +198,10 @@ file or directory Install OpenBLAS with the following command: ```console -$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` -```` + +::: ## Extra information diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md index 56545253b1ef7..0808b869fdb7b 100644 --- a/docs/source/getting_started/installation/cpu/apple.inc.md +++ b/docs/source/getting_started/installation/cpu/apple.inc.md @@ -25,9 +25,9 @@ pip install -r requirements-cpu.txt pip install -e . ``` -```{note} +:::{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. -``` +::: #### Troubleshooting diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md index 4ec907c0e9fda..2f549ede0cf48 100644 --- a/docs/source/getting_started/installation/cpu/index.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -2,86 +2,86 @@ vLLM is a Python library that supports the following CPU variants. 
Select your CPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: ### Pre-built wheels @@ -89,41 +89,41 @@ Currently, there are no pre-built CPU wheels. ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images @@ -142,9 +142,9 @@ $ docker run -it \ vllm-cpu-env ``` -:::{tip} +::::{tip} For ARM or Apple silicon, use `Dockerfile.arm` -::: +:::: ## Supported features diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index e0eaac5099305..f146ae0918b44 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -17,10 +17,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, :::{include} build.inc.md ::: -```{note} +:::{note} - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. 
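Since the x86 note above says the CPU backend build script inspects the host CPU flags to decide whether to enable AVX512_BF16, here is a small Linux-only convenience snippet (not part of vLLM's build system) for checking those flags yourself before deciding whether to force `VLLM_CPU_AVX512BF16=1` for a cross-compilation.

```python
def cpu_has_flag(flag: str) -> bool:
    """Best-effort check of /proc/cpuinfo for an ISA flag (Linux only)."""
    try:
        with open("/proc/cpuinfo") as f:
            return any(flag in line.split() for line in f if line.startswith("flags"))
    except OSError:
        return False


for flag in ("avx512f", "avx512_bf16"):
    print(f"{flag}: {'present' if cpu_has_flag(flag) else 'not reported'}")
```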
-``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 4cce65278c069..5c2ea30dbfde1 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ### Create a new Python environment -```{note} +:::{note} PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. -``` +::: In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -100,10 +100,10 @@ pip install --editable . You can find more information about vLLM's wheels in . -```{note} +:::{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. -``` +::: #### Full build (with compilation) @@ -115,7 +115,7 @@ cd vllm pip install -e . ``` -```{tip} +:::{tip} Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . @@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. -``` +::: ##### Use an existing PyTorch installation diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md index 6c007382b2c3d..0a61f889753a3 100644 --- a/docs/source/getting_started/installation/gpu/index.md +++ b/docs/source/getting_started/installation/gpu/index.md @@ -2,299 +2,299 @@ vLLM is a Python library that supports the following GPU variants. 
Select your GPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - OS: Linux - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Create a new Python environment" :end-before: "### Pre-built wheels" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm There is no extra information on creating a new Python environment for this device. -::: +:::: -:::{tab-item} XPU +::::{tab-item} XPU :sync: xpu There is no extra information on creating a new Python environment for this device. 
-::: - :::: +::::: + ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + (build-from-source)= ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: :::: +::::: + ## Supported features -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Supported features" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 69238f6e36fb2..131ad1704ea11 
100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -16,10 +16,10 @@ Currently, there are no pre-built ROCm wheels. However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. -```{tip} +:::{tip} Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) for instructions on how to use this prebuilt docker image. -``` +::: ### Build wheel from source @@ -47,9 +47,9 @@ for instructions on how to use this prebuilt docker image. cd ../.. ``` - ```{note} - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ``` + :::{note} + If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ::: 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) @@ -67,9 +67,9 @@ for instructions on how to use this prebuilt docker image. cd .. ``` - ```{note} - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ``` + :::{note} + You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ::: 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: @@ -95,17 +95,18 @@ for instructions on how to use this prebuilt docker image. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - ```{tip} + + :::{tip} - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ``` + ::: -```{tip} +:::{tip} - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 577986eba74fd..bc01c6000bc07 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -30,10 +30,10 @@ pip install -v -r requirements-xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -```{note} +:::{note} - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. 
-``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index bc1d268bf0c7e..0f5e013ce071a 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -4,10 +4,10 @@ vLLM supports the following hardware platforms: -```{toctree} +:::{toctree} :maxdepth: 1 gpu/index cpu/index ai_accelerator/index -``` +::: diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 25cfac5f58aa7..cb73914c9c75e 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y conda activate myenv ``` -```{note} +:::{note} [PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. -``` +::: Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 8ac80e5e5c553..f4682ee45a48e 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -32,9 +32,9 @@ conda activate myenv pip install vllm ``` -```{note} +:::{note} For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. -``` +::: (quickstart-offline)= @@ -69,9 +69,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model]( llm = LLM(model="facebook/opt-125m") ``` -```{note} +:::{note} By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. -``` +::: Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. @@ -97,10 +97,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` -```{note} +:::{note} By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](#chat-template). -``` +::: This server can be queried in the same format as OpenAI API. For example, to list the models: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 7bfe9b4036adf..2f41fa3b6b19e 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -4,9 +4,9 @@ This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. 
If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -```{note} +:::{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -``` +::: ## Hangs downloading a model @@ -18,9 +18,9 @@ It's recommended to download the model first using the [huggingface-cli](https:/ If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -```{note} +:::{note} To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. -``` +::: ## Out of memory @@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. -```{note} +:::{note} A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -``` +::: (troubleshooting-python-multiprocessing)= diff --git a/docs/source/index.md b/docs/source/index.md index 6957d5dd0f2e7..e90e81c72860a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,13 +1,13 @@ # Welcome to vLLM -```{figure} ./assets/logos/vllm-logo-text-light.png +:::{figure} ./assets/logos/vllm-logo-text-light.png :align: center :alt: vLLM :class: no-scaled-link :width: 60% -``` +::: -```{raw} html +:::{raw} html

Easy, fast, and cheap LLM serving for everyone @@ -19,7 +19,7 @@ Watch Fork

-``` +::: vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -58,7 +58,7 @@ For more information, check out the following: % How to start using vLLM? -```{toctree} +:::{toctree} :caption: Getting Started :maxdepth: 1 @@ -67,11 +67,11 @@ getting_started/quickstart getting_started/examples/examples_index getting_started/troubleshooting getting_started/faq -``` +::: % What does vLLM support? -```{toctree} +:::{toctree} :caption: Models :maxdepth: 1 @@ -79,11 +79,11 @@ models/generative_models models/pooling_models models/supported_models models/extensions/index -``` +::: % Additional capabilities -```{toctree} +:::{toctree} :caption: Features :maxdepth: 1 @@ -96,11 +96,11 @@ features/automatic_prefix_caching features/disagg_prefill features/spec_decode features/compatibility_matrix -``` +::: % Details about running vLLM -```{toctree} +:::{toctree} :caption: Inference and Serving :maxdepth: 1 @@ -113,11 +113,11 @@ serving/engine_args serving/env_vars serving/usage_stats serving/integrations/index -``` +::: % Scaling up vLLM for production -```{toctree} +:::{toctree} :caption: Deployment :maxdepth: 1 @@ -126,21 +126,21 @@ deployment/k8s deployment/nginx deployment/frameworks/index deployment/integrations/index -``` +::: % Making the most out of vLLM -```{toctree} +:::{toctree} :caption: Performance :maxdepth: 1 performance/optimization performance/benchmarks -``` +::: % Explanation of vLLM internals -```{toctree} +:::{toctree} :caption: Design Documents :maxdepth: 2 @@ -151,11 +151,11 @@ design/kernel/paged_attention design/mm_processing design/automatic_prefix_caching design/multiprocessing -``` +::: % How to contribute to the vLLM project -```{toctree} +:::{toctree} :caption: Developer Guide :maxdepth: 2 @@ -164,11 +164,11 @@ contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index contributing/vulnerability_management -``` +::: % Technical API specifications -```{toctree} +:::{toctree} :caption: API Reference :maxdepth: 2 @@ -177,18 +177,18 @@ api/engine/index api/inference_params api/multimodal/index api/model/index -``` +::: % Latest news and acknowledgements -```{toctree} +:::{toctree} :caption: Community :maxdepth: 1 community/blog community/meetups community/sponsors -``` +::: ## Indices and tables diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md index cff09d12eba47..69faf472e5300 100644 --- a/docs/source/models/extensions/index.md +++ b/docs/source/models/extensions/index.md @@ -1,8 +1,8 @@ # Built-in Extensions -```{toctree} +:::{toctree} :maxdepth: 1 runai_model_streamer tensorizer -``` +::: diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 75f7a9fcad416..99c37876a01b3 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -48,6 +48,6 @@ You can read further about CPU buffer memory limiting [here](https://github.com/ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` -```{note} +:::{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). 
-``` +::: diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index ae17e3437bca6..830c579d91bae 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -11,6 +11,6 @@ For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). -```{note} +:::{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. -``` +::: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index e4b4cd03a90d2..4abe6b776eea3 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -70,10 +70,10 @@ The {class}`~vllm.LLM.chat` method implements chat functionality on top of {clas In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -```{important} +:::{important} In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. -``` +::: ```python llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 91db694be29a4..9704ccee745c4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -8,54 +8,54 @@ In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmMo These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input before returning them. -```{note} +:::{note} We currently support pooling models primarily as a matter of convenience. As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -``` +::: For pooling models, we support the following `--task` options. The selected option sets the default pooler used to extract the final hidden states: -```{list-table} +:::{list-table} :widths: 50 25 25 25 :header-rows: 1 -* - Task - - Pooling Type - - Normalization - - Softmax -* - Embedding (`embed`) - - `LAST` - - ✅︎ - - ✗ -* - Classification (`classify`) - - `LAST` - - ✗ - - ✅︎ -* - Sentence Pair Scoring (`score`) - - \* - - \* - - \* -* - Reward Modeling (`reward`) - - `ALL` - - ✗ - - ✗ -``` +- * Task + * Pooling Type + * Normalization + * Softmax +- * Embedding (`embed`) + * `LAST` + * ✅︎ + * ✗ +- * Classification (`classify`) + * `LAST` + * ✗ + * ✅︎ +- * Sentence Pair Scoring (`score`) + * \* + * \* + * \* +- * Reward Modeling (`reward`) + * `ALL` + * ✗ + * ✗ +::: \*The default pooler is always defined by the model. -```{note} +:::{note} If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. 
-``` +::: When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -```{tip} +:::{tip} You can customize the model's pooling method via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. -``` +::: ## Offline Inference @@ -111,10 +111,10 @@ The {class}`~vllm.LLM.score` method outputs similarity scores between sentence p It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). These types of models serve as rerankers between candidate query-document pairs in RAG systems. -```{note} +:::{note} vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -``` +::: ```python llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index e59150cdd3b83..94f4bd6cadabd 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -17,7 +17,7 @@ By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co To determine whether a given model is supported, you can check the `config.json` file inside the HF repository. If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory. -````{tip} +:::{tip} The easiest way to check if your model is really supported at runtime is to run the program below: ```python @@ -35,7 +35,7 @@ print(output) ``` If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -```` +::: Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. @@ -72,364 +72,364 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `AquilaForCausalLM` - - Aquila, Aquila2 - - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. - - ✅︎ - - ✅︎ -* - `ArcticForCausalLM` - - Arctic - - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ -* - `BaiChuanForCausalLM` - - Baichuan2, Baichuan - - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ -* - `BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - `bigscience/bloom`, `bigscience/bloomz`, etc. - - - - ✅︎ -* - `BartForConditionalGeneration` - - BART - - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. - - - - -* - `ChatGLMModel` - - ChatGLM - - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ -* - `CohereForCausalLM`, `Cohere2ForCausalLM` - - Command-R - - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. - - ✅︎ - - ✅︎ -* - `DbrxForCausalLM` - - DBRX - - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. 
- - - - ✅︎ -* - `DeciLMForCausalLM` - - DeciLM - - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ -* - `DeepseekForCausalLM` - - DeepSeek - - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ -* - `DeepseekV2ForCausalLM` - - DeepSeek-V2 - - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. - - - - ✅︎ -* - `DeepseekV3ForCausalLM` - - DeepSeek-V3 - - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. - - - - ✅︎ -* - `ExaoneForCausalLM` - - EXAONE-3 - - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `FalconForCausalLM` - - Falcon - - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ -* - `FalconMambaForCausalLM` - - FalconMamba - - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `GemmaForCausalLM` - - Gemma - - `google/gemma-2b`, `google/gemma-7b`, etc. - - ✅︎ - - ✅︎ -* - `Gemma2ForCausalLM` - - Gemma2 - - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. - - ✅︎ - - ✅︎ -* - `GlmForCausalLM` - - GLM-4 - - `THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ -* - `GPT2LMHeadModel` - - GPT-2 - - `gpt2`, `gpt2-xl`, etc. - - - - ✅︎ -* - `GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ -* - `GPTJForCausalLM` - - GPT-J - - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. - - - - ✅︎ -* - `GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ -* - `GraniteForCausalLM` - - Granite 3.0, Granite 3.1, PowerLM - - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ -* - `GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `InternLMForCausalLM` - - InternLM - - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM2ForCausalLM` - - InternLM2 - - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM3ForCausalLM` - - InternLM3 - - `internlm/internlm3-8b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `JAISLMHeadModel` - - Jais - - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ -* - `JambaForCausalLM` - - Jamba - - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ -* - `MambaForCausalLM` - - Mamba - - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ -* - `MiniCPMForCausalLM` - - MiniCPM - - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ -* - `MiniCPM3ForCausalLM` - - MiniCPM3 - - `openbmb/MiniCPM3-4B`, etc. 
- - ✅︎ - - ✅︎ -* - `MistralForCausalLM` - - Mistral, Mistral-Instruct - - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. - - - - ✅︎ -* - `NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ -* - `OLMoForCausalLM` - - OLMo - - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. - - - - ✅︎ -* - `OLMo2ForCausalLM` - - OLMo2 - - `allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ -* - `OLMoEForCausalLM` - - OLMoE - - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `OPTForCausalLM` - - OPT, OPT-IML - - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. - - - - ✅︎ -* - `OrionForCausalLM` - - Orion - - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ -* - `PhiForCausalLM` - - Phi - - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ -* - `Phi3ForCausalLM` - - Phi-4, Phi-3 - - `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Phi3SmallForCausalLM` - - Phi-3-Small - - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ -* - `PhiMoEForCausalLM` - - Phi-3.5-MoE - - `microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ -* - `PersimmonForCausalLM` - - Persimmon - - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. - - - - ✅︎ -* - `QWenLMHeadModel` - - Qwen - - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForCausalLM` - - QwQ, Qwen2 - - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2MoeForCausalLM` - - Qwen2MoE - - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ -* - `StableLmForCausalLM` - - StableLM - - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ -* - `Starcoder2ForCausalLM` - - Starcoder2 - - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. - - - - ✅︎ -* - `SolarForCausalLM` - - Solar Pro - - `upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ -* - `TeleChat2ForCausalLM` - - TeleChat2 - - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ -* - `XverseForCausalLM` - - XVERSE - - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `AquilaForCausalLM` + * Aquila, Aquila2 + * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. + * ✅︎ + * ✅︎ +- * `ArcticForCausalLM` + * Arctic + * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. + * + * ✅︎ +- * `BaiChuanForCausalLM` + * Baichuan2, Baichuan + * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. + * ✅︎ + * ✅︎ +- * `BloomForCausalLM` + * BLOOM, BLOOMZ, BLOOMChat + * `bigscience/bloom`, `bigscience/bloomz`, etc. + * + * ✅︎ +- * `BartForConditionalGeneration` + * BART + * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. 
+ * + * +- * `ChatGLMModel` + * ChatGLM + * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + * ✅︎ + * ✅︎ +- * `CohereForCausalLM`, `Cohere2ForCausalLM` + * Command-R + * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. + * ✅︎ + * ✅︎ +- * `DbrxForCausalLM` + * DBRX + * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. + * + * ✅︎ +- * `DeciLMForCausalLM` + * DeciLM + * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + * + * ✅︎ +- * `DeepseekForCausalLM` + * DeepSeek + * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. + * + * ✅︎ +- * `DeepseekV2ForCausalLM` + * DeepSeek-V2 + * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. + * + * ✅︎ +- * `DeepseekV3ForCausalLM` + * DeepSeek-V3 + * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. + * + * ✅︎ +- * `ExaoneForCausalLM` + * EXAONE-3 + * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `FalconForCausalLM` + * Falcon + * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. + * + * ✅︎ +- * `FalconMambaForCausalLM` + * FalconMamba + * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `GemmaForCausalLM` + * Gemma + * `google/gemma-2b`, `google/gemma-7b`, etc. + * ✅︎ + * ✅︎ +- * `Gemma2ForCausalLM` + * Gemma2 + * `google/gemma-2-9b`, `google/gemma-2-27b`, etc. + * ✅︎ + * ✅︎ +- * `GlmForCausalLM` + * GLM-4 + * `THUDM/glm-4-9b-chat-hf`, etc. + * ✅︎ + * ✅︎ +- * `GPT2LMHeadModel` + * GPT-2 + * `gpt2`, `gpt2-xl`, etc. + * + * ✅︎ +- * `GPTBigCodeForCausalLM` + * StarCoder, SantaCoder, WizardCoder + * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. + * ✅︎ + * ✅︎ +- * `GPTJForCausalLM` + * GPT-J + * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. + * + * ✅︎ +- * `GPTNeoXForCausalLM` + * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. + * + * ✅︎ +- * `GraniteForCausalLM` + * Granite 3.0, Granite 3.1, PowerLM + * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. + * ✅︎ + * ✅︎ +- * `GraniteMoeForCausalLM` + * Granite 3.0 MoE, PowerMoE + * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. + * ✅︎ + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `InternLMForCausalLM` + * InternLM + * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM2ForCausalLM` + * InternLM2 + * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM3ForCausalLM` + * InternLM3 + * `internlm/internlm3-8b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `JAISLMHeadModel` + * Jais + * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. + * + * ✅︎ +- * `JambaForCausalLM` + * Jamba + * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. 
+ * ✅︎ + * ✅︎ +- * `MambaForCausalLM` + * Mamba + * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. + * + * ✅︎ +- * `MiniCPMForCausalLM` + * MiniCPM + * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. + * ✅︎ + * ✅︎ +- * `MiniCPM3ForCausalLM` + * MiniCPM3 + * `openbmb/MiniCPM3-4B`, etc. + * ✅︎ + * ✅︎ +- * `MistralForCausalLM` + * Mistral, Mistral-Instruct + * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MixtralForCausalLM` + * Mixtral-8x7B, Mixtral-8x7B-Instruct + * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MPTForCausalLM` + * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. + * + * ✅︎ +- * `NemotronForCausalLM` + * Nemotron-3, Nemotron-4, Minitron + * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + * ✅︎ + * ✅︎ +- * `OLMoForCausalLM` + * OLMo + * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. + * + * ✅︎ +- * `OLMo2ForCausalLM` + * OLMo2 + * `allenai/OLMo2-7B-1124`, etc. + * + * ✅︎ +- * `OLMoEForCausalLM` + * OLMoE + * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `OPTForCausalLM` + * OPT, OPT-IML + * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. + * + * ✅︎ +- * `OrionForCausalLM` + * Orion + * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. + * + * ✅︎ +- * `PhiForCausalLM` + * Phi + * `microsoft/phi-1_5`, `microsoft/phi-2`, etc. + * ✅︎ + * ✅︎ +- * `Phi3ForCausalLM` + * Phi-4, Phi-3 + * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Phi3SmallForCausalLM` + * Phi-3-Small + * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. + * + * ✅︎ +- * `PhiMoEForCausalLM` + * Phi-3.5-MoE + * `microsoft/Phi-3.5-MoE-instruct`, etc. + * ✅︎ + * ✅︎ +- * `PersimmonForCausalLM` + * Persimmon + * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. + * + * ✅︎ +- * `QWenLMHeadModel` + * Qwen + * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForCausalLM` + * QwQ, Qwen2 + * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2MoeForCausalLM` + * Qwen2MoE + * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + * + * ✅︎ +- * `StableLmForCausalLM` + * StableLM + * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. + * + * ✅︎ +- * `Starcoder2ForCausalLM` + * Starcoder2 + * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. + * + * ✅︎ +- * `SolarForCausalLM` + * Solar Pro + * `upstage/solar-pro-preview-instruct`, etc. + * ✅︎ + * ✅︎ +- * `TeleChat2ForCausalLM` + * TeleChat2 + * `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + * ✅︎ + * ✅︎ +- * `XverseForCausalLM` + * XVERSE + * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. + * ✅︎ + * ✅︎ +::: + +:::{note} Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. 
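To make the task selection concrete before the tables that follow, here is a minimal offline sketch of running one of the listed embedding models in pooling mode. It assumes the `task="embed"` option and the `LLM.embed()` helper described on the pooling-models page; the model name is just one entry taken from the embedding table below, and output field names may differ slightly between vLLM releases.

```python
from vllm import LLM

# Explicitly pin the pooling task so a dual-purpose architecture is not
# loaded in generative mode (see the note that follows).
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

(output,) = llm.embed("Follow the white rabbit.")
embedding = output.outputs.embedding  # flat list of floats
print(f"Embedding dimension: {len(embedding)}")
```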
-```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertModel` - - BERT-based - - `BAAI/bge-base-en-v1.5`, etc. - - - - -* - `Gemma2Model` - - Gemma2-based - - `BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. - - Llama-based - - `intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2Model`, `Qwen2ForCausalLM` - - Qwen2-based - - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ -* - `RobertaModel`, `RobertaForMaskedLM` - - RoBERTa-based - - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. - - - - -* - `XLMRobertaModel` - - XLM-RoBERTa-based - - `intfloat/multilingual-e5-large`, etc. - - - - -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertModel` + * BERT-based + * `BAAI/bge-base-en-v1.5`, etc. + * + * +- * `Gemma2Model` + * Gemma2-based + * `BAAI/bge-multilingual-gemma2`, etc. + * + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. + * Llama-based + * `intfloat/e5-mistral-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2Model`, `Qwen2ForCausalLM` + * Qwen2-based + * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + * ✅︎ + * ✅︎ +- * `RobertaModel`, `RobertaForMaskedLM` + * RoBERTa-based + * `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. + * + * +- * `XLMRobertaModel` + * XLM-RoBERTa-based + * `intfloat/multilingual-e5-large`, etc. + * + * +::: + +:::{note} `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. -``` +::: -```{note} +:::{note} Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. @@ -438,7 +438,7 @@ despite being described otherwise on its model card. Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). -``` +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings @@ -446,98 +446,98 @@ of the whole prompt are extracted from the normalized hidden state corresponding #### Reward Modeling (`--task reward`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `InternLM2ForRewardModel` - - InternLM2-based - - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. 
- - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama-based - - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForRewardModel` - - Qwen2-based - - `Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForProcessRewardModel` - - Qwen2-based - - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `InternLM2ForRewardModel` + * InternLM2-based + * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama-based + * `peiyi9979/math-shepherd-mistral-7b-prm`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-RM-72B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForProcessRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. -```{important} +:::{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -``` +::: #### Classification (`--task classify`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `JambaForSequenceClassification` - - Jamba - - `ai21labs/Jamba-tiny-reward-dev`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForSequenceClassification` - - Qwen2-based - - `jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `JambaForSequenceClassification` + * Jamba + * `ai21labs/Jamba-tiny-reward-dev`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForSequenceClassification` + * Qwen2-based + * `jason9693/Qwen2.5-1.5B-apeach`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertForSequenceClassification` - - BERT-based - - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - -* - `RobertaForSequenceClassification` - - RoBERTa-based - - `cross-encoder/quora-roberta-base`, etc. - - - - -* - `XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - `BAAI/bge-reranker-v2-m3`, etc. - - - - -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertForSequenceClassification` + * BERT-based + * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + * + * +- * `RobertaForSequenceClassification` + * RoBERTa-based + * `cross-encoder/quora-roberta-base`, etc. + * + * +- * `XLMRobertaForSequenceClassification` + * XLM-RoBERTa-based + * `BAAI/bge-reranker-v2-m3`, etc. 
+ * + * +::: (supported-mm-models)= @@ -560,11 +560,12 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. -````{important} +:::{important} To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: Offline inference: + ```python llm = LLM( model="Qwen/Qwen2-VL-7B-Instruct", @@ -573,14 +574,16 @@ llm = LLM( ``` Online serving: + ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 ``` -```` -```{note} +::: + +:::{note} vLLM currently only supports adding LoRA to the language backbone of multimodal models. -``` +::: ### Generative Models @@ -588,256 +591,256 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 15 20 5 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) - - [V1](gh-issue:8779) -* - `AriaForConditionalGeneration` - - Aria - - T + I+ - - `rhymes-ai/Aria` - - - - ✅︎ - - ✅︎ -* - `Blip2ForConditionalGeneration` - - BLIP-2 - - T + IE - - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - ✅︎ -* - `ChameleonForConditionalGeneration` - - Chameleon - - T + I - - `facebook/chameleon-7b` etc. - - - - ✅︎ - - ✅︎ -* - `DeepseekVLV2ForCausalLM` - - DeepSeek-VL2 - - T + I+ - - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) - - - - ✅︎ - - ✅︎ -* - `FuyuForCausalLM` - - Fuyu - - T + I - - `adept/fuyu-8b` etc. - - - - ✅︎ - - ✅︎ -* - `ChatGLMModel` - - GLM-4V - - T + I - - `THUDM/glm-4v-9b` etc. - - ✅︎ - - ✅︎ - - -* - `H2OVLChatModel` - - H2OVL - - T + IE+ - - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - -* - `Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - `HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - -* - `InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + IE+ - - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + IE+ - - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + IE+ - - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I+ + V+ - - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `MiniCPMV` - - MiniCPM-V - - T + IE+ - - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - -* - `MllamaForConditionalGeneration` - - Llama 3.2 - - T + I+ - - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - -* - `MolmoForCausalLM` - - Molmo - - T + I - - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. 
- - ✅︎ - - ✅︎ - - ✅︎ -* - `NVLM_D_Model` - - NVLM-D 1.0 - - T + IE+ - - `nvidia/NVLM-D-72B`, etc. - - - - ✅︎ - - ✅︎ -* - `PaliGemmaForConditionalGeneration` - - PaliGemma, PaliGemma 2 - - T + IE - - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. - - - - ✅︎ - - -* - `Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - - - ✅︎ - - ✅︎ -* - `PixtralForConditionalGeneration` - - Pixtral - - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `QWenLMHeadModel` - - Qwen-VL - - T + IE+ - - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - - ✅︎ - - ✅︎ - - ✅︎ -* - `Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A+ - - `Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - QVQ, Qwen2-VL - - T + IE+ + VE+ - - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - ✅︎ -* - `UltravoxModel` - - Ultravox - - T + AE+ - - `fixie-ai/ultravox-v0_3` - - - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) + * [V1](gh-issue:8779) +- * `AriaForConditionalGeneration` + * Aria + * T + I+ + * `rhymes-ai/Aria` + * + * ✅︎ + * ✅︎ +- * `Blip2ForConditionalGeneration` + * BLIP-2 + * T + IE + * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. + * + * ✅︎ + * ✅︎ +- * `ChameleonForConditionalGeneration` + * Chameleon + * T + I + * `facebook/chameleon-7b` etc. + * + * ✅︎ + * ✅︎ +- * `DeepseekVLV2ForCausalLM` + * DeepSeek-VL2 + * T + I+ + * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + * + * ✅︎ + * ✅︎ +- * `FuyuForCausalLM` + * Fuyu + * T + I + * `adept/fuyu-8b` etc. + * + * ✅︎ + * ✅︎ +- * `ChatGLMModel` + * GLM-4V + * T + I + * `THUDM/glm-4v-9b` etc. + * ✅︎ + * ✅︎ + * +- * `H2OVLChatModel` + * H2OVL + * T + IE+ + * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. + * + * ✅︎ + * +- * `Idefics3ForConditionalGeneration` + * Idefics3 + * T + I + * `HuggingFaceM4/Idefics3-8B-Llama3` etc. + * ✅︎ + * + * +- * `InternVLChatModel` + * InternVL 2.5, Mono-InternVL, InternVL 2.0 + * T + IE+ + * `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaForConditionalGeneration` + * LLaVA-1.5 + * T + IE+ + * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT + * T + IE+ + * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextVideoForConditionalGeneration` + * LLaVA-NeXT-Video + * T + V + * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaOnevisionForConditionalGeneration` + * LLaVA-Onevision + * T + I+ + V+ + * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `MiniCPMV` + * MiniCPM-V + * T + IE+ + * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. + * ✅︎ + * ✅︎ + * +- * `MllamaForConditionalGeneration` + * Llama 3.2 + * T + I+ + * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. 
+ * + * + * +- * `MolmoForCausalLM` + * Molmo + * T + I + * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `NVLM_D_Model` + * NVLM-D 1.0 + * T + IE+ + * `nvidia/NVLM-D-72B`, etc. + * + * ✅︎ + * ✅︎ +- * `PaliGemmaForConditionalGeneration` + * PaliGemma, PaliGemma 2 + * T + IE + * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. + * + * ✅︎ + * +- * `Phi3VForCausalLM` + * Phi-3-Vision, Phi-3.5-Vision + * T + IE+ + * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. + * + * ✅︎ + * ✅︎ +- * `PixtralForConditionalGeneration` + * Pixtral + * T + I+ + * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `QWenLMHeadModel` + * Qwen-VL + * T + IE+ + * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `Qwen2AudioForConditionalGeneration` + * Qwen2-Audio + * T + A+ + * `Qwen/Qwen2-Audio-7B-Instruct` + * + * ✅︎ + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * QVQ, Qwen2-VL + * T + IE+ + VE+ + * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `UltravoxModel` + * Ultravox + * T + AE+ + * `fixie-ai/ultravox-v0_3` + * + * ✅︎ + * ✅︎ +::: E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -```{note} +:::{note} To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -``` +::: -```{note} +:::{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -``` +::: -```{note} +:::{note} The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: -``` +::: -```{note} +:::{note} The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). A corrected version is available at . -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. -```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) Any text generation model can be converted into an embedding model by passing `--task embed`. -```{note} +:::{note} To get the best results, you should use pooling models that are specifically trained as such. -``` +::: The following table lists those that are tested in vLLM. 
-```{list-table} +:::{list-table} :widths: 25 25 15 25 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - `royokong/e5-v` - - - - ✅︎ -* - `Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - `TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - `MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT-based + * T / I + * `royokong/e5-v` + * + * ✅︎ +- * `Phi3VForCausalLM` + * Phi-3-Vision-based + * T + I + * `TIGER-Lab/VLM2Vec-Full` + * 🚧 + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * Qwen2-VL-based + * T + I + * `MrLight/dse-qwen2-2b-mrl-v1` + * + * ✅︎ +::: _________________ @@ -849,9 +852,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - ```{tip} + :::{tip} When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ``` + ::: 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index daf6e2f250416..3f9ca27eb438e 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -14,9 +14,9 @@ In short, you should increase the number of GPUs and the number of nodes until y After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. 
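As a quick illustration of the estimate described above (a sketch only — `790` is just the example value from the log line, and `16` is vLLM's default block size):

```python
# Rough KV-cache capacity estimate from the "# GPU blocks: 790" log line.
gpu_blocks = 790      # number of GPU KV-cache blocks reported at startup
block_size = 16       # tokens stored per block (vLLM's default)
max_tokens = gpu_blocks * block_size
print(f"Roughly {max_tokens} tokens can be cached concurrently")  # 12640
```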
-```{note} +:::{note} There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. -``` +::: ## Running vLLM on a single node @@ -94,12 +94,12 @@ vllm serve /path/to/the/model/in/the/container \ To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. -```{warning} +:::{warning} After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. -``` +::: -```{warning} +:::{warning} Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -``` +::: diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index cd3c6a430b7fa..827c25b50522f 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -4,6 +4,7 @@ Below, you can find an explanation of every engine argument for vLLM: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils @@ -16,6 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM: Below are the additional arguments related to the asynchronous engine: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md index f9b08077a03b4..9845241930a40 100644 --- a/docs/source/serving/env_vars.md +++ b/docs/source/serving/env_vars.md @@ -2,14 +2,14 @@ vLLM uses the following environment variables to configure the system: -```{warning} +:::{warning} Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. 
It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -``` +::: -```{literalinclude} ../../../vllm/envs.py +:::{literalinclude} ../../../vllm/envs.py :end-before: end-env-vars-definition :language: python :start-after: begin-env-vars-definition -``` +::: diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md index 371c284981ce9..e2b4c0814605b 100644 --- a/docs/source/serving/integrations/index.md +++ b/docs/source/serving/integrations/index.md @@ -1,8 +1,8 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 langchain llamaindex -``` +::: diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 6c84f6d1350a6..6c0dc8880a90d 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -31,8 +31,8 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I The following metrics are exposed: -```{literalinclude} ../../../vllm/engine/metrics.py +:::{literalinclude} ../../../vllm/engine/metrics.py :end-before: end-metrics-definitions :language: python :start-after: begin-metrics-definitions -``` +::: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0213b0a3388ea..217b531e83788 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -4,10 +4,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. -```{note} +:::{note} We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. -``` +::: ## Offline Inference @@ -203,13 +203,13 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -```{important} +:::{important} A chat template is **required** to use Chat Completions API. Although most models come with a chat template, for others you have to define one yourself. The chat template can be inferred based on the documentation on the model's HuggingFace repo. For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: -``` +::: ### Image @@ -273,24 +273,25 @@ print("Chat completion output:", chat_response.choices[0].message.content) Full example: -```{tip} +:::{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, and pass the file path as `url` in the API request. -``` +::: -```{tip} +:::{tip} There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. 
In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -``` +::: -````{note} +:::{note} By default, the timeout for fetching images through HTTP URL is `5` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_IMAGE_FETCH_TIMEOUT= +export VLLM_IMAGE_FETCH_TIMEOUT= ``` -```` + +::: ### Video @@ -345,14 +346,15 @@ print("Chat completion output from image url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_VIDEO_FETCH_TIMEOUT= +export VLLM_VIDEO_FETCH_TIMEOUT= ``` -```` + +::: ### Audio @@ -448,24 +450,25 @@ print("Chat completion output from audio url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_AUDIO_FETCH_TIMEOUT= +export VLLM_AUDIO_FETCH_TIMEOUT= ``` -```` + +::: ### Embedding vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. -```{tip} +:::{tip} The schema of `messages` is exactly the same as in Chat Completions API. You can refer to the above tutorials for more details on how to pass each type of multi-modal data. -``` +::: Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. Refer to the examples below for illustration. @@ -477,13 +480,13 @@ vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja ``` -```{important} +:::{important} Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. The custom chat template is completely different from the original one for this model, and can be found here: -``` +::: Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: @@ -518,16 +521,16 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja ``` -```{important} +:::{important} Like with VLM2Vec, we have to explicitly pass `--task embed`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled by a custom chat template: -``` +::: -```{important} +:::{important} Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -``` +::: Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 8a18598665a70..ded57500c5d0d 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -22,9 +22,9 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. 
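For reference, a minimal end-to-end sketch of the offline `LLM` API discussed here (the model name is only a placeholder — substitute any generative checkpoint supported by vLLM):

```python
from vllm import LLM, SamplingParams

# Placeholder model; any supported generative model name works here.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```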
-```{seealso} +:::{seealso} [API Reference](/api/offline_inference/index) -``` +::: ## Configuration Options @@ -70,12 +70,12 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` -```{important} +:::{important} To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. -``` +::: #### Quantization diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 8bc234545befd..82ef54c16dafb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -161,11 +161,11 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. -```{argparse} +:::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs :prog: vllm serve -``` +::: #### Configuration file @@ -188,10 +188,10 @@ To use the above config file: vllm serve SOME_MODEL --config config.yaml ``` -```{note} +:::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. -``` +::: ## API Reference @@ -208,19 +208,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-sampling-params :end-before: end-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-extra-params :end-before: end-completion-extra-params -``` +::: (chat-api)= @@ -240,19 +240,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-sampling-params :end-before: end-chat-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-extra-params :end-before: end-chat-completion-extra-params -``` +::: (embeddings-api)= @@ -264,9 +264,9 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. -```{tip} +:::{tip} This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. -``` +::: Code example: @@ -274,27 +274,27 @@ Code example: The following [pooling parameters](#pooling-params) are supported. 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-pooling-params :end-before: end-embedding-pooling-params -``` +::: The following extra parameters are supported by default: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-extra-params :end-before: end-embedding-extra-params -``` +::: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-embedding-extra-params :end-before: end-chat-embedding-extra-params -``` +::: (tokenizer-api)= @@ -465,19 +465,19 @@ Response: The following [pooling parameters](#pooling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-pooling-params :end-before: end-score-pooling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-extra-params :end-before: end-score-extra-params -``` +::: (rerank-api)= @@ -552,16 +552,16 @@ Response: The following [pooling parameters](#pooling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-rerank-pooling-params :end-before: end-rerank-pooling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-rerank-extra-params :end-before: end-rerank-extra-params -``` +::: diff --git a/pyproject.toml b/pyproject.toml index 8f2e20d0f5800..9892967b82d79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,6 +111,7 @@ markers = [ ] [tool.pymarkdown] +plugins.md004.style = "sublist" # ul-style plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html From 46fb056749b7d9f5e4ea7a060207ed2eb3ad75e0 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 29 Jan 2025 04:11:16 +0000 Subject: [PATCH 28/69] [V1][Metrics] Add TTFT and TPOT histograms (#12530) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 6 ++++++ vllm/v1/engine/output_processor.py | 4 +++- vllm/v1/metrics/loggers.py | 25 ++++++++++++++++++++++++ vllm/v1/metrics/stats.py | 11 +++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9a84c82b62fdf..901ba8e8e5ef3 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:request_generation_tokens_sum", "vllm:request_generation_tokens_bucket", "vllm:request_generation_tokens_count", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + 
"vllm:time_to_first_token_seconds_count", + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", ] diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 39217b8090140..234ef8194ca93 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -27,6 +27,7 @@ def __init__( prompt: Optional[str], prompt_token_ids: List[int], detokenizer: IncrementalDetokenizer, + arrival_time: float, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id @@ -37,7 +38,7 @@ def __init__( self.is_prefilling = True self.queue = queue - self.stats = RequestStateStats() + self.stats = RequestStateStats(last_token_time=arrival_time) @classmethod def from_new_request( @@ -54,6 +55,7 @@ def from_new_request( tokenizer=tokenizer, request=request, ), + arrival_time=request.arrival_time, queue=queue, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 87d9d63652c05..9bb24d1948651 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig): buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) + self.histogram_time_to_first_token = \ + prometheus_client.Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ], + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_per_output_token = \ + prometheus_client.Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, + 0.75, 1.0, 2.5 + ], + labelnames=labelnames).labels(*labelvalues) + def log(self, scheduler_stats: SchedulerStats, iteration_stats: IterationStats): """Log to prometheus.""" @@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats, self.histogram_num_generation_tokens_request.observe( finished_request.num_generation_tokens) + for ttft in iteration_stats.time_to_first_tokens_iter: + self.histogram_time_to_first_token.observe(ttft) + for tpot in iteration_stats.time_per_output_tokens_iter: + self.histogram_time_per_output_token.observe(tpot) + @staticmethod def _unregister_vllm_metrics(): # Unregister any existing vLLM collectors (for CI/CD diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 55d85a7992cc5..f4c276f0b6902 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,3 +1,4 @@ +import time from dataclasses import dataclass from typing import TYPE_CHECKING, List @@ -22,6 +23,7 @@ class RequestStateStats: """Stats that need to be tracked across delta updates.""" num_generation_tokens: int = 0 + last_token_time: float = 0.0 @dataclass @@ -40,6 +42,8 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 self.finished_requests: List[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: List[float] = [] + self.time_per_output_tokens_iter: List[float] = [] def update_from_output(self, output: "EngineCoreOutput", is_prefilling: bool, prompt_len: int, @@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput", return num_new_generation_tokens = len(output.new_token_ids) + now = time.time() + last_token_latency = now - 
request_state_stats.last_token_time self.num_generation_tokens += num_new_generation_tokens if is_prefilling: @@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput", assert (num_new_generation_tokens > 0) self.num_prompt_tokens += prompt_len + self.time_to_first_tokens_iter.append(last_token_latency) + else: + self.time_per_output_tokens_iter.append(last_token_latency) + request_state_stats.num_generation_tokens += num_new_generation_tokens + request_state_stats.last_token_time = now def update_from_finished_request(self, request_output: "RequestOutput", request_state_stats: RequestStateStats): From bd02164cf9eeed8436b26d62c37c1d792e97f9e8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 28 Jan 2025 23:49:03 -0500 Subject: [PATCH 29/69] Bugfix for whisper quantization due to fake k_proj bias (#12524) Signed-off-by: mgoin --- vllm/model_executor/models/whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b8512b735da94..15e35fa9cd2c9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -743,7 +743,7 @@ def _create_fake_bias_for_k_proj( So that the bias for k_proj in qkv_proj can be initialized with zeros. """ for name, weight in weights: - if ".self_attn.k_proj.weight" in name: + if name.endswith(".self_attn.k_proj.weight"): bias = torch.zeros(weight.size(0)) bias_name = name.replace("weight", "bias") yield from [(name, weight), (bias_name, bias)] From 5f671cb4c3145194e94ffb393ee459432f7fa2b8 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Tue, 28 Jan 2025 23:56:56 -0500 Subject: [PATCH 30/69] [V1] Improve Error Message for Unsupported Config (#12535) Co-authored-by: Michael Goin --- vllm/platforms/cuda.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2587e3a11dde3..e4b436edf7588 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -120,13 +120,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on VLLM V1. Please launch without " + "--num-scheduler-steps.") else: parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Speculative decoding is not yet supported on VLLM V1." 
+ ) else: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" From ef001d98ef36166ebacb48eab2e32eb738407b53 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 29 Jan 2025 04:53:13 -0300 Subject: [PATCH 31/69] Fix the pydantic logging validator (#12420) Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/protocol.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2bc136cc48038..29d071ce50c8e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -6,7 +6,8 @@ from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, + ValidationInfo, field_validator, model_validator) from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -45,14 +46,14 @@ class OpenAIBaseModel(BaseModel): # Cache class field names field_names: ClassVar[Optional[Set[str]]] = None - @model_validator(mode="before") + @model_validator(mode="wrap") @classmethod - def __log_extra_fields__(cls, data): - + def __log_extra_fields__(cls, data, handler): + result = handler(data) + if not isinstance(data, dict): + return result field_names = cls.field_names if field_names is None: - if not isinstance(data, dict): - return data # Get all class field names and their potential aliases field_names = set() for field_name, field in cls.model_fields.items(): @@ -67,7 +68,7 @@ def __log_extra_fields__(cls, data): "The following fields were present in the request " "but ignored: %s", data.keys() - field_names) - return data + return result class ErrorResponse(OpenAIBaseModel): @@ -1287,6 +1288,20 @@ class BatchRequestInput(OpenAIBaseModel): # The parameters of the request. body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] + @field_validator('body', mode='plain') + @classmethod + def check_type_for_url(cls, value: Any, info: ValidationInfo): + # Use url to disambiguate models + url = info.data['url'] + if url == "/v1/chat/completions": + return ChatCompletionRequest.model_validate(value) + if url == "/v1/embeddings": + return TypeAdapter(EmbeddingRequest).validate_python(value) + if url == "/v1/score": + return ScoreRequest.model_validate(value) + return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest, + ScoreRequest]).validate_python(value) + class BatchResponseData(OpenAIBaseModel): # HTTP status code of the response. 
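For context on the `mode="wrap"` validator introduced in the patch above: unlike a `before` validator, a wrap validator receives a `handler` and must call it to run the regular validation, which lets the extra-field logging happen without altering the parsed result. Below is a stripped-down, self-contained sketch of the same pattern (an illustration only, not part of the patch; alias handling, which the real `OpenAIBaseModel` also performs, is omitted):

```python
from typing import Any

from pydantic import BaseModel, model_validator


class Example(BaseModel):
    known_field: int = 0

    @model_validator(mode="wrap")
    @classmethod
    def __log_extra_fields__(cls, data: Any, handler):
        result = handler(data)  # run the regular validation first
        if isinstance(data, dict):
            # Report any keys that are not declared model fields.
            extra = data.keys() - cls.model_fields.keys()
            if extra:
                print(f"Ignored fields: {sorted(extra)}")
        return result


Example.model_validate({"known_field": 1, "surprise": 2})
# prints: Ignored fields: ['surprise']
```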
From 036ca94c25fa07391016aa1b4f93a8ac5d74f296 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 29 Jan 2025 01:54:35 -0700 Subject: [PATCH 32/69] [Bugfix] handle alignment of arguments in convert_sparse_cross_attention_mask_to_dense (#12347) Signed-off-by: Travis Johnson Signed-off-by: Wallas Santos Co-authored-by: Wallas Santos --- .../vision_language/test_mllama.py | 208 ++++++++++++++++++ vllm/model_executor/models/mllama.py | 18 +- 2 files changed, 222 insertions(+), 4 deletions(-) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 636a3eedff31b..16c71228ede7a 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,11 +1,15 @@ from typing import List, Optional, Tuple, Type, overload import pytest +import torch from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID, + MllamaForConditionalGeneration) from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs @@ -33,6 +37,29 @@ "meta-llama/Llama-3.2-11B-Vision-Instruct", ] +# Indices for inputs +TEXT_ONLY = '0' +IMAGE_AT_BEG = '1' +IMAGE_AT_MIDDLE = '2' +TWO_IMAGES = '3' + +# Input tokenized +prompt_data = { + # Tell me a story + TEXT_ONLY: [41551, 757, 264, 3446], + # <|image|> What's the content of this image + IMAGE_AT_BEG: + [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], + # Hello <|image|>What' the content of this image + IMAGE_AT_MIDDLE: + [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], + #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? 
# noqa: E501 + TWO_IMAGES: [ + MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, + MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 + ] +} + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -365,3 +392,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, num_logprobs=num_logprobs, tensor_parallel_size=1, ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, + num_logprobs, attn_backend: _Backend) -> None: + + stop_sign = image_assets[0].pil_image + + with global_force_attn_backend_context_manager(attn_backend), vllm_runner( + model, + dtype=dtype, + max_model_len=4096, + max_num_seqs=2, + tensor_parallel_size=1, + enforce_eager=True, + limit_mm_per_prompt={"image": + _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: + + # Regression tests for https://github.com/vllm-project/vllm/issues/10648 + + # Number of image tags is greater than the number of images provided + prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501 + image = stop_sign + with pytest.raises(ValueError): + vllm_model.generate_greedy_logprobs([prompt], + max_tokens, + num_logprobs, + images=[image]) + + # Batch of a text-only and image request that requires cross-attention + prompts = [ + "What is the capital of spain?", + "Text before the image...<|image|>What is in the image?", # noqa: E501 + ] + images = [ + None, + [stop_sign], + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + # Test the reverse order too for good measure + prompts = [ + "<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501 + "<|begin_of_text|>Hello!", + ] + images = [ + [stop_sign], + None, + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices_and_output", + # inputs, (cross_attention_mask, kv_range_for_decode) + [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), + ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), + ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + ((23, 24), [[0, 6], [6, 12]])), + ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), + ([TWO_IMAGES], ((18, 12), [[6, 12]])), + ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) +def test_get_cross_attention_mask(input_indices_and_output) -> None: + + input_indices, expected_output = input_indices_and_output + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices + if i != TEXT_ONLY] + input = torch.cat(sequences) + + seq_lens = [len(s) for s in sequences] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: 
dict[str, str] = {} + + cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ + .get_cross_attention_mask(dummy, + input, + attn_data, + num_tiles=num_tiles, + num_tokens_per_tile=3, + dtype=torch.bfloat16) + + expected_cross_attention_mask, expected_kv_range_for_decode = \ + expected_output + + assert kv_range_for_decode == expected_kv_range_for_decode + if expected_cross_attention_mask is not None: + assert cross_attention_mask is not None + assert cross_attention_mask.shape == expected_cross_attention_mask + else: + assert cross_attention_mask is None + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices", + [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], + [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) +def test_get_full_text_row_masked_out_mask(input_indices) -> None: + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + + seq_lens = [len(s) for s in sequences] + + num_prefill_tokens = sum(seq_lens) + + # TEXT_ONLY is zero, so it will be masked out, + # other instances should not be. + encoder_seq_lens = [int(i) for i in input_indices] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + encoder_seq_lens=encoder_seq_lens, + num_prefill_tokens=num_prefill_tokens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: dict[str, str] = {} + + full_text_row_masked_out_mask = MllamaForConditionalGeneration\ + .get_full_text_row_masked_out_mask(dummy, + attn_data, + torch.get_default_device()) + + full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() + full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() + + idx = 0 + assert len(full_text_row_masked_out_mask) == num_prefill_tokens + for i, seq_len in enumerate(seq_lens): + must_be_masked = input_indices[i] != TEXT_ONLY + for _ in range(seq_len): + assert full_text_row_masked_out_mask[idx] == must_be_masked, \ + f"full_text_row_masked_out_mask[{idx}] must be " \ + f"'{must_be_masked}' " + idx += 1 diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e15ac84a6049b..34b8624647ce6 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1485,14 +1485,23 @@ def convert_sparse_cross_attention_mask_to_dense( total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) - # A list of ranges, range[i] = [start, end] means - # if the i-th sample has N tiles in total, the tiles[start, end] - # will be used for cross-attention decoding. + # A list of ranges, range[i] = [start, end] means that the i-th image will + # use tiles[start, end] for cross-attention decoding. tile_range_for_decode = [] seq_start = 0 tile_start = 0 - for masks, tiles, length in zip(sparse_mask, num_tiles, lengths): + + # sparse_mask has an [] entry for each sequence that does not have images, + # but num_tiles does not have these entries... 
+ num_tiles_idx = 0 + for masks, length in zip(sparse_mask, lengths): + if len(masks) == 0: + # Text only + continue + + tiles = num_tiles[num_tiles_idx] + num_tiles_idx += 1 ts, td = -1, 0 for mask, tile in zip(masks, tiles): if len(mask) != 2: @@ -1512,6 +1521,7 @@ def convert_sparse_cross_attention_mask_to_dense( assert td != 0 tile_range_for_decode.append((ts, ts + td)) seq_start += length + assert num_tiles_idx == len(num_tiles) return dense_mask, tile_range_for_decode From d93bf4da855a0c5e8d3c875def6b37c5e9d77763 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:24:59 +0800 Subject: [PATCH 33/69] [Model] Refactoring of MiniCPM-V and add MiniCPM-o-2.6 support for vLLM (#12069) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: hzh Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Signed-off-by: shaochangxu.scx Signed-off-by: DarkLight1337 Signed-off-by: NickLucche Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Roger Wang Signed-off-by: Rafael Vasquez Signed-off-by: Akshat Tripathi Signed-off-by: Oleg Mosalov Signed-off-by: Jee Jee Li Signed-off-by: rshaw@neuralmagic.com Signed-off-by: Yida Wu Signed-off-by: Chenguang Li <757486878@qq.com> Signed-off-by: youkaichao Signed-off-by: Alex-Brooks Signed-off-by: Chen Zhang Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Shanshan Shen <467638484@qq.com> Signed-off-by: elijah Signed-off-by: Yikun Signed-off-by: mgoin Signed-off-by: Woosuk Kwon Signed-off-by: Konrad Zawora Signed-off-by: tjtanaa Signed-off-by: wangxiyuan Signed-off-by: Rui Qiao Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com> Co-authored-by: shaochangxu.scx Co-authored-by: Cyrus Leung Co-authored-by: Nicolò Lucchesi Co-authored-by: sixgod Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Rafael Vasquez Co-authored-by: Isotr0py Co-authored-by: Cyrus Leung Co-authored-by: Akshat Tripathi Co-authored-by: Oleg Mosalov Co-authored-by: Jee Jee Li Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Co-authored-by: Yangcheng Li Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com> Co-authored-by: Concurrensee Co-authored-by: Chenguang Li <757486878@qq.com> Co-authored-by: youkaichao Co-authored-by: Alex Brooks Co-authored-by: Chen Zhang Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Shanshan Shen <467638484@qq.com> Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com> Co-authored-by: Yikun Jiang Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com> Co-authored-by: mgoin Co-authored-by: Woosuk Kwon Co-authored-by: Konrad Zawora Co-authored-by: TJian Co-authored-by: tjtanaa Co-authored-by: wangxiyuan Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com> Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Co-authored-by: Roger Wang --- docs/source/models/supported_models.md | 9 +- examples/offline_inference/audio_language.py | 32 +- examples/offline_inference/vision_language.py | 33 +- requirements-cpu.txt | 
1 + requirements-cuda.txt | 1 + requirements-test.in | 3 + requirements-test.txt | 37 +- .../vision_language/test_models.py | 14 + .../vision_language/vlm_utils/model_utils.py | 11 + .../multimodal/processing/test_common.py | 2 + tests/models/registry.py | 4 +- vllm/entrypoints/chat_utils.py | 6 +- vllm/model_executor/models/minicpmo.py | 811 +++++++++++++++++ vllm/model_executor/models/minicpmv.py | 843 ++++++++++++++---- vllm/model_executor/models/registry.py | 1 + 15 files changed, 1622 insertions(+), 186 deletions(-) create mode 100644 vllm/model_executor/models/minicpmo.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 94f4bd6cadabd..afaad8818bdcb 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -693,9 +693,16 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `MiniCPMO` + * MiniCPM-O + * T + IE+ + VE+ + AE+ + * `openbmb/MiniCPM-o-2_6`, etc. + * ✅︎ + * ✅︎ + * - * `MiniCPMV` * MiniCPM-V - * T + IE+ + * T + IE+ + VE+ * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. * ✅︎ * ✅︎ diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 6fd74782a9aae..5952ec13ec3cb 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -67,7 +67,37 @@ def run_qwen2_audio(question: str, audio_count: int): return llm, prompt, stop_token_ids -model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio} +def run_minicpmo(question: str, audio_count: int): + model_name = "openbmb/MiniCPM-o-2_6" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM(model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + audio_placeholder = "()" * audio_count + audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501 + messages = [{ + 'role': 'user', + 'content': f'{audio_placeholder}\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True, + chat_template=audio_chat_template) + return llm, prompt, stop_token_ids + + +model_example_map = { + "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio, + "minicpmo": run_minicpmo +} def main(args): diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 415439e88ed59..38c2b13d3f2c7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -265,8 +265,9 @@ def run_mantis(question: str, modality: str): # MiniCPM-V -def run_minicpmv(question: str, modality: str): - assert modality == "image" +def run_minicpmv_base(question: str, modality: str, model_name): + assert modality in ["image", "video"] + # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa # 2.0 # The official repo doesn't work yet, so we need to use a fork for now @@ -277,7 +278,15 @@ def run_minicpmv(question: str, modality: str): # 
model_name = "openbmb/MiniCPM-Llama3-V-2_5" # 2.6 - model_name = "openbmb/MiniCPM-V-2_6" + # model_name = "openbmb/MiniCPM-V-2_6" + # o2.6 + + # modality supports + # 2.0: image + # 2.5: image + # 2.6: image, video + # o2.6: image, video, audio + # model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) llm = LLM( @@ -294,13 +303,18 @@ def run_minicpmv(question: str, modality: str): # 2.5 # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - # 2.6 + # 2.6 / o2.6 stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + modality_placeholder = { + "image": "(./)", + "video": "()", + } + messages = [{ 'role': 'user', - 'content': f'(./)\n{question}' + 'content': f'{modality_placeholder[modality]}\n{question}' }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, @@ -308,6 +322,14 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +def run_minicpmo(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6") + + +def run_minicpmv(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6") + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -523,6 +545,7 @@ def run_qwen2_vl(question: str, modality: str): "llava-next-video": run_llava_next_video, "llava-onevision": run_llava_onevision, "mantis": run_mantis, + "minicpmo": run_minicpmo, "minicpmv": run_minicpmv, "mllama": run_mllama, "molmo": run_molmo, diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 056fbf5a7adec..ed0d2c9fae0b6 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -4,5 +4,6 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" +torchaudio; platform_machine != "ppc64le" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch datasets # for benchmark scripts diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 8002fbd8ee5b9..78fa360f2dc96 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,6 +5,7 @@ ray[default] >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 +torchaudio==2.5.1 # These must be updated alongside torch torchvision == 0.20.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/requirements-test.in b/requirements-test.in index bc76a91ad5356..13ad17b256734 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -12,6 +12,8 @@ decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test peft pqdm ray[adag]==2.40.0 @@ -19,6 +21,7 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test torch==2.5.1 +torchaudio==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.0 # required for pixtral test diff --git a/requirements-test.txt b/requirements-test.txt index 09e009c2e21f4..df7e904bb0d34 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -106,9 +106,17 @@ dnspython==2.7.0 docutils==0.16 # via awscli einops==0.8.0 - # via -r requirements-test.in + # via + # -r requirements-test.in + # encodec + # vector-quantize-pytorch + # vocos +einx==0.3.0 + # via vector-quantize-pytorch email-validator==2.2.0 # via pydantic +encodec==0.1.1 + # via vocos evaluate==0.4.3 # via lm-eval fastparquet==2024.11.0 @@ -125,6 +133,8 @@ filelock==3.16.1 # triton fonttools==4.54.1 # via matplotlib +frozendict==2.4.6 + # via einx frozenlist==1.5.0 # via # aiohttp @@ -159,6 +169,7 @@ huggingface-hub==0.26.2 # timm # tokenizers # transformers + # vocos idna==3.10 # via # anyio @@ -261,6 +272,8 @@ numpy==1.26.4 # cupy-cuda12x # datasets # decord + # einx + # encodec # evaluate # fastparquet # genai-perf @@ -283,6 +296,7 @@ numpy==1.26.4 # torchvision # transformers # tritonclient + # vocos nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -455,6 +469,7 @@ pyyaml==6.0.2 # responses # timm # transformers + # vocos ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 @@ -517,6 +532,7 @@ scipy==1.13.1 # scikit-learn # sentence-transformers # statsmodels + # vocos sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -540,7 +556,9 @@ sqlitedict==2.1.0 statsmodels==0.14.4 # via genai-perf sympy==1.13.1 - # via torch + # via + # einx + # torch tabledata==1.3.3 # via pytablewriter tabulate==0.9.0 @@ -568,12 +586,21 @@ torch==2.5.1 # -r requirements-test.in # accelerate # bitsandbytes + # encodec # lm-eval # peft # sentence-transformers # tensorizer # timm + # torchaudio # torchvision + # vector-quantize-pytorch + # vocos +torchaudio==2.5.1 + # via + # -r requirements-test.in + # encodec + # vocos torchvision==0.20.1 # via timm tqdm==4.66.6 @@ -584,6 +611,7 @@ tqdm==4.66.6 # lm-eval # nltk # peft + # pqdm # sentence-transformers # tqdm-multiprocess # transformers @@ -615,6 +643,7 @@ typing-extensions==4.12.2 # huggingface-hub # librosa # mistral-common + # pqdm # pydantic # pydantic-core # torch @@ -626,6 +655,10 @@ urllib3==2.2.3 # requests # responses # tritonclient +vector-quantize-pytorch==1.21.2 + # via -r requirements-test.in +vocos==0.1.0 + # via -r requirements-test.in word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index d5f0d63288cc1..62c644f73d62d 100644 --- 
a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -350,6 +350,20 @@ postprocess_inputs=model_utils.wrap_inputs_post_processor, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "minicpmo_26": VLMTestInfo( + models=["openbmb/MiniCPM-o-2_6"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "(./)\n", + max_model_len=4096, + max_num_seqs=2, + get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 + postprocess_inputs=model_utils.ignore_inputs_post_processor( + "image_sizes" + ), + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, + patch_hf_runner=model_utils.minicpmo_patch_hf_runner + ), "minicpmv_26": VLMTestInfo( models=["openbmb/MiniCPM-V-2_6"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 1ca85c7bb2056..07bdb2cee44d2 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -497,6 +497,17 @@ def _generate(self, *args, **kwargs): return hf_model +def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + orig_generate = hf_model.model.generate + + def _generate(self, *args, **kwargs): + return orig_generate(*args, decode_text=False, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model + + def _generate_greedy_logprobs_limit( self, prompts: List[str], diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index b575ec6acbef3..ca28da268fa05 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -152,6 +152,8 @@ def _test_processing_correctness( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "TIGER-Lab/Mantis-8B-siglip-llama3", "mistral-community/pixtral-12b", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2_6", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index 0bd06dea0ec7f..7952e65aa76a5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -245,7 +245,9 @@ def check_available_online( "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 - "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", + trust_remote_code=True), + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", trust_remote_code=True), diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 723d6e9085806..97d2561df602a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -392,7 +392,7 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == 
"phi3_v": # Workaround since this token is not defined in the tokenizer return f"<|image_{current_count}|>" - if model_type == "minicpmv": + if model_type in ("minicpmo", "minicpmv"): return "(./)" if model_type in ("blip-2", "chatglm", "fuyu", "paligemma", "pixtral"): @@ -424,10 +424,14 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == "qwen2_audio": return (f"Audio {current_count}: " f"<|audio_bos|><|AUDIO|><|audio_eos|>") + if model_type == "minicpmo": + return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" + if model_type in ("minicpmo", "minicpmv"): + return "()" if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.video_token_index) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py new file mode 100644 index 0000000000000..eb4282d62005a --- /dev/null +++ b/vllm/model_executor/models/minicpmo.py @@ -0,0 +1,811 @@ +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" +from functools import partial +from itertools import accumulate +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.types +from torch import nn +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.whisper.modeling_whisper import ( + ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.multimodal.parse import (ModalityData, ModalityDataItems, + MultiModalDataItems, MultiModalDataParser, + VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + PromptReplacement) +from vllm.multimodal.profiling import ProcessorInputs +from vllm.sequence import IntermediateTensors + +from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder, + MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser, + MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo) +from .utils import AutoWeightsLoader, maybe_prefix + +CPU_DEVICE = torch.device("cpu") + +MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems + + +class MiniCPMOAudioFeatureInputs(TypedDict): + type: Literal["audio_features"] + data: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, num_channels, length)` + Slice here means chunk. Audio that is too long will be split into slices, + which is the same as image. + Padding is used therefore `data` is `torch.Tensor`. + """ + + audio_feature_lens: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices)` + + This should be feature length of each audio slice, + which equals to `data.shape[-1]` + """ + + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. + """ + + +class MiniCPMOAudioEmbeddingInputs(TypedDict): + type: Literal["audio_embeds"] + data: List[torch.Tensor] + """ + Shape: `(batch_size * num_images * num_slices, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + instead of a batched tensor. + Length of each slice may vary, so pass it as a list. + """ + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. 
+ """ + + +MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, + MiniCPMOAudioEmbeddingInputs] + + +class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "audio") + audio_embeds = self.data.get("audio_embeds", None) + if audio_embeds is None: + raise ValueError("Incorrect type of video_embeds", + "Got type: None") + self.data["audio_embeds"] = audio_embeds + + def get(self, index: int) -> object: + return self.data["audio_embeds"][index] + + +class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser): + + def _parse_audio_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMOAudioEmbeddingItems(data) + return super()._parse_audio_data(data) + + +class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): + audio_pattern = "()" + + def get_supported_mm_modalities(self) -> List[str]: + return ["image", "video", "audio"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None, "audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "audio": self.get_max_audio_tokens(), + "video": self.get_max_video_tokens(seq_len) + } + + def get_default_audio_pool_step(self) -> int: + return 2 + + def get_default_audio_sampling_rate(self) -> int: + return 16000 + + def get_chunk_length(self) -> int: + return self.get_hf_config().audio_chunk_length + + def get_max_audio_tokens_per_chunk(self) -> int: + pool_step = self.get_default_audio_pool_step() + fbank_feat_in_chunk = 100 + cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1 + num_audio_tokens = (cnn_feat_in_chunk - pool_step) // pool_step + 1 + return num_audio_tokens + 2 # + + def get_max_audio_chunks_with_most_features(self) -> int: + return 30 + + def get_audio_len_by_num_chunks(self, num_chunks: int) -> int: + sampling_rate = self.get_default_audio_sampling_rate() + # exclude + num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2 + return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1 + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + max_audios = mm_config.limit_per_prompt.get("audio", 1) + + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_audio_tokens = self.get_max_audio_tokens( + ) * max_audios + 2 * max_audios + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens - + max_audio_tokens) + + num_frames = max(max_total_frames // max(max_videos, 1), 1) + + return num_frames + + +class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder): + + def get_dummy_processor_inputs( + self, seq_len: int, mm_counts: Mapping[str, + int]) -> ProcessorInputs: + num_audios = mm_counts.get("audio", 0) + audio_len = self.info.get_max_audio_chunks_with_most_features() * \ + self.info.get_default_audio_sampling_rate() + + processor_inputs = super().get_dummy_processor_inputs( + seq_len, mm_counts) + mm_data = { + "image": + processor_inputs.mm_data["image"], + "video": + processor_inputs.mm_data["video"], + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + audio_prompt_texts = 
self.info.audio_pattern * num_audios + + return ProcessorInputs(prompt_text=processor_inputs.prompt_text + \ + audio_prompt_texts, + mm_data=mm_data) + + +class MiniCPMOMultiModalProcessor( + MiniCPMVMultiModalProcessor, + BaseMultiModalProcessor[MiniCPMOProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMOMultiModalDataParser( + target_sr=self.info.get_default_audio_sampling_rate()) + + def get_audio_prompt_texts(self, + audio_lens: int, + chunk_input: bool = True, + chunk_length: int = 1) -> str: + return self.info.get_hf_processor().get_audio_placeholder( + audio_lens, chunk_input, chunk_length) + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = super().get_special_tokens() + if hasattr(tokenizer, "audio_start_id"): + special_tokens["audio_start_id"] = torch.tensor( + tokenizer.audio_start_id) + special_tokens["audio_end_id"] = torch.tensor( + tokenizer.audio_end_id) + return special_tokens + + def process_audios(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + audios = mm_data.pop("audios", []) + audio_embeds = mm_data.pop("audio_embeds", []) + if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: + audio_outputs = { + "audio_lens": [], + "audio_features": [], + "audio_feature_lens": [], + "audio_num_segments": [] + } + for audio in audios: + single_audio_outputs = super().call_base_hf_processor( + prompt=self.info.audio_pattern, + mm_data={ + "audios": audio, + "chunk_input": True + }, + mm_kwargs=mm_kwargs) + audio_outputs["audio_lens"].append(len(audio)) + audio_outputs["audio_features"].append( + single_audio_outputs["audio_features"]) + audio_outputs["audio_num_segments"].append( + len(single_audio_outputs["audio_feature_lens"][0])) + audio_outputs["audio_feature_lens"] += \ + single_audio_outputs["audio_feature_lens"] + audio_outputs["audio_features"] = [ + audio_feature for single_audio_features in \ + audio_outputs["audio_features"] + for audio_feature in single_audio_features + ] + audio_outputs["audio_feature_lens"] = torch.cat( + audio_outputs["audio_feature_lens"]) + elif len(audio_embeds): + audio_outputs = { + "audio_lens": [ + self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + for single_audio_embeds in audio_embeds + ], + "audio_embeds": [ + chunk_embeds for single_audio_embeds in audio_embeds + for chunk_embeds in single_audio_embeds + ], + "audio_num_segments": [ + len(single_audio_embeds) + for single_audio_embeds in audio_embeds + ] + } + else: + audio_outputs = {} + return audio_outputs + + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video|audio)>./\)" + + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video|audio)>./\)" + + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs), + "audio": self.process_audios(mm_data, mm_kwargs) + } + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "audio": + return "audio_lens" + return super().get_modality_num_counter(modality) + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "audio": + return inputs["audio"]["audio_num_segments"][index] + return super().get_num_slices_by_modality(inputs, modality, index) + + def 
get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "audio": + return self.get_audio_prompt_texts( + inputs["audio"]["audio_lens"][index]) + return super().get_prompt_texts_by_modality(inputs, modality, index) + + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, + "audio": self.info.audio_pattern + } + + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + elif modality == "video": + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + else: # audio + if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems): + single_audio_embeds = mm_items["audio"].get(item_idx) + audio_len = self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + return self.get_audio_prompt_texts(audio_len) + return self.get_audio_prompt_texts( + len(mm_items["audio"].get(item_idx))) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video", "audio") + ] + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + audio_slices = get_slices( + hf_inputs.get("audio_num_slices", torch.empty(0))) + return dict( + **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), + audio_features=MultiModalFieldConfig.flat("audio", audio_slices), + audio_feature_lens=MultiModalFieldConfig.flat( + "audio", audio_slices), + audio_num_slices=MultiModalFieldConfig.batched("audio"), + audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"), + audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices)) + + +class MultiModalProjector(nn.Module): + + def __init__(self, in_dim: int, out_dim: int): + super().__init__() + self.linear1 = nn.Linear(in_features=in_dim, + out_features=out_dim, + bias=True) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(in_features=out_dim, + out_features=out_dim, + bias=True) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + hidden_states = self.relu(self.linear1(audio_features)) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class MiniCPMWhisperEncoderLayer(nn.Module): + + def __init__(self, config: WhisperConfig, layer_idx: int = None): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = WHISPER_ATTENTION_CLASSES[ + config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + layer_idx=layer_idx, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = 
nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + past_key_values = None + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, past_key_values = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_values, + ) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, + p=self.activation_dropout, + training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, + min=-clamp_value, + max=clamp_value) + + outputs = (hidden_states, ) + + return outputs + + +class MiniCPMWhisperEncoder(WhisperEncoder): + + def __init__(self, config: WhisperConfig): + super().__init__(config) + self.layers = nn.ModuleList([ + MiniCPMWhisperEncoderLayer(config, layer_idx=i) + for i in range(config.encoder_layers) + ]) + + def forward( + self, + input_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> BaseModelOutputWithPast: + # Ignore copy + input_features = input_features.to(dtype=self.conv1.weight.dtype, + device=self.conv1.weight.device) + + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + + embed_pos = self.embed_positions.weight + + embed_pos = embed_pos[:inputs_embeds.shape[1], :] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + + encoder_states = () + + for idx, encoder_layer in enumerate(self.layers): + encoder_states = encoder_states + (hidden_states, ) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + # Ignore copy + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + ) + + hidden_states = layer_outputs[0] + + hidden_states = self.layer_norm(hidden_states) + encoder_states = encoder_states + (hidden_states, ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMOMultiModalProcessor, + info=MiniCPMOProcessingInfo, + dummy_inputs=MiniCPMODummyInputsBuilder) +class MiniCPMO(MiniCPMV2_6): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.apm = 
self.init_audio_module(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "apm")) + + def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): + # Do not use parameters temporarily + audio_config = self.config.audio_config + model = MiniCPMWhisperEncoder(audio_config) + audio_output_dim = int(audio_config.encoder_ffn_dim // 4) + self.audio_avg_pooler = \ + nn.AvgPool1d(self.config.audio_pool_step, + stride=self.config.audio_pool_step) + self.audio_projection_layer = \ + MultiModalProjector(in_dim=audio_output_dim,out_dim=self.embed_dim) + self.audio_encoder_layer = -1 + return model + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) + return loader.load_weights(weights) + + def subsequent_chunk_mask( + self, + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = CPU_DEVICE, + num_lookhead: int = 0, + ) -> torch.Tensor: + ret = torch.zeros(size, size, device=device, dtype=torch.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max((i // chunk_size - num_left_chunks) * chunk_size, + 0) + ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, + size) + ret[i, start:ending] = True + return ret + + def _get_feat_extract_output_lengths(self, + input_lengths: torch.LongTensor): + input_lengths_after_cnn = (input_lengths - 1) // 2 + 1 + input_lengths_after_pooling = ( + input_lengths_after_cnn - + self.config.audio_pool_step) // self.config.audio_pool_step + 1 + input_lengths_after_pooling = input_lengths_after_pooling.to( + dtype=torch.int32) + + return input_lengths_after_cnn, input_lengths_after_pooling + + # Copied from HF repo of MiniCPM-o-2_6, + # designed for batched inputs and outputs + def get_audio_hidden_states(self, data: MiniCPMOAudioInputs, + chunk_length: int) -> torch.Tensor: + wavforms = data.get( + "data", + []) # (bs, 80, frames) or [], multi audios need filled in advance + audio_feature_lens_raw = [data.get("audio_feature_lens", + [])] # list, [[x1, x2], [y1], [z1]] + + # exist audio + if len(wavforms) > 0: + audio_feature_lens = torch.hstack(audio_feature_lens_raw) + batch_size, _, max_mel_seq_len = wavforms.shape + max_seq_len = (max_mel_seq_len - 1) // 2 + 1 + + # Create a sequence tensor of shape (batch_size, max_seq_len) + seq_range = (torch.arange( + 0, + max_seq_len, + dtype=audio_feature_lens.dtype, + device=audio_feature_lens.device).unsqueeze(0).expand( + batch_size, max_seq_len)) + lengths_expand = audio_feature_lens.unsqueeze(1).expand( + batch_size, max_seq_len) + # Create mask + padding_mask = seq_range >= lengths_expand # 1 for padded values + + audio_attention_mask_ = padding_mask.view( + batch_size, 1, 1, max_seq_len).expand(batch_size, 1, + max_seq_len, max_seq_len) + audio_attention_mask = audio_attention_mask_.to( + dtype=self.apm.conv1.weight.dtype, + device=self.apm.conv1.weight.device) + + if chunk_length > 0: + chunk_num_frame = int(chunk_length * 50) + chunk_mask = self.subsequent_chunk_mask( + size=max_seq_len, + chunk_size=chunk_num_frame, + num_left_chunks=-1, + device=audio_attention_mask_.device, + ) + audio_attention_mask_ = torch.logical_or( + audio_attention_mask_, torch.logical_not(chunk_mask)) + + audio_attention_mask[audio_attention_mask_] = float("-inf") + audio_states = self.apm( + wavforms, attention_mask=audio_attention_mask).hidden_states[ + self.audio_encoder_layer] + audio_embeds = self.audio_projection_layer(audio_states) + + audio_embeds = 
audio_embeds.transpose(1, 2) + audio_embeds = self.audio_avg_pooler(audio_embeds) + audio_embeds = audio_embeds.transpose(1, 2) + + _, feature_lens_after_pooling = \ + self._get_feat_extract_output_lengths(audio_feature_lens) + + num_audio_tokens = feature_lens_after_pooling + + final_audio_embeds = [] + idx = 0 + for i in range(len(audio_feature_lens_raw)): + target_audio_embeds = [] + for _ in range(len(audio_feature_lens_raw[i])): + target_audio_embeds.append( + audio_embeds[idx, :num_audio_tokens[idx], :]) + idx += 1 + final_audio_embeds.append(target_audio_embeds) + return final_audio_embeds + else: + return [] + + def get_embedding_with_audios(self, vlm_embedding: torch.Tensor, + audio_inputs: Optional[MiniCPMOAudioInputs], + chunk_length: int) -> torch.Tensor: + device, dtype = vlm_embedding.device, vlm_embedding.dtype + if audio_inputs["type"] == "audio_embeds": + audio_embeddings = audio_inputs["data"] + audio_embeddings = [ + audio_embeddings[i].to(device=device, dtype=dtype) + for i in range(len(audio_embeddings)) + ] + else: + audio_embeddings = self.get_audio_hidden_states( + audio_inputs, chunk_length)[0] + if audio_embeddings is None or len(audio_embeddings) == 0: + return vlm_embedding + audio_bounds = audio_inputs["audio_bounds"] + if self.config.chunk_input: + audio_embs = torch.cat(audio_embeddings, dim=0).to(device=device, + dtype=dtype) + audio_start_pos = 0 + for bound in audio_bounds: + audio_len = bound[1] - bound[0] + vlm_embedding[bound[0]:bound[1]] = audio_embs[ + audio_start_pos:audio_start_pos + audio_len, :] + audio_start_pos += audio_len + else: + for embs, bound in zip(audio_embeddings, audio_bounds): + audio_indices = torch.arange(bound[0], + bound[1], + dtype=torch.long).to(device) + + if embs.shape[0] != len(audio_indices): + raise ValueError( + "Shape mismatch: Trying to assign embeddings " + f"of shape {embs.shape} " + f"to input indices of length {len(audio_indices)}") + vlm_embedding[audio_indices] = embs.to(dtype) + return vlm_embedding + + def _get_audio_bounds(self, input_ids: torch.Tensor, + audio_start_id: torch.Tensor, + audio_end_id: torch.Tensor) -> torch.Tensor: + audio_start_tokens, = torch.where(input_ids == audio_start_id[0]) + audio_start_tokens += 1 + audio_end_tokens, = torch.where(input_ids == audio_end_id[0]) + valid_audio_nums = max(len(audio_start_tokens), len(audio_end_tokens)) + return torch.hstack([ + audio_start_tokens[:valid_audio_nums].unsqueeze(-1), + audio_end_tokens[:valid_audio_nums].unsqueeze(-1) + ]) + + def _parse_and_validate_audio_inputs( + self, input_ids: torch.Tensor, + **kwargs: object) -> Tuple[MiniCPMOAudioInputs]: + audio_features = kwargs.pop("audio_features", []) + audio_feature_lens = kwargs.pop("audio_feature_lens", []) + audio_embeds = kwargs.pop("audio_embeds", None) + audio_start_id = kwargs.pop("audio_start_id", None) + audio_end_id = kwargs.pop("audio_end_id", None) + if audio_embeds is not None: + audio_embeds = [ + audio_embeds[i][j] for i in range(len(audio_embeds)) + for j in range(len(audio_embeds[i])) + ] + return MiniCPMOAudioEmbeddingInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_embeds, + type="audio_embeds") + if len(audio_features) > 0: + audio_features_all = [ + i.permute(1, 0) for audio_feature in audio_features + for i in audio_feature + ] + audio_features = torch.nn.utils.rnn.pad_sequence( + audio_features_all, batch_first=True, + padding_value=0.0).permute(0, 2, 1) + audio_feature_lens = torch.cat( + [item for item in 
audio_feature_lens]) + + return MiniCPMOAudioFeatureInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_features, + audio_feature_lens=audio_feature_lens, + type="audio_features") + return None + + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + image_inputs = self._parse_and_validate_image_inputs( + input_ids, **kwargs) + if not any("audio" in key for key in kwargs): + return image_inputs, None + audio_inputs = self._parse_and_validate_audio_inputs( + input_ids, **kwargs) + return image_inputs, audio_inputs + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: Any, + ) -> torch.Tensor: + if intermediate_tensors is not None: + vlm_embeddings = None + else: + image_inputs, audio_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) + + if audio_inputs is not None: + vlm_embeddings = self.get_embedding_with_audios( + vlm_embeddings, audio_inputs, + self.config.audio_chunk_length) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None + + output = self.llm.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=vlm_embeddings, + ) + return output diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1aa529056893b..bf967d33a3176 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,21 +22,21 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re +from collections import Counter from functools import cached_property, partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Set, Tuple, TypedDict, Union) +from itertools import accumulate +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.types from PIL import Image from torch import nn -from transformers import PretrainedConfig -from typing_extensions import NotRequired +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) @@ -48,33 +48,30 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, PlaceholderRange) +from vllm.multimodal.parse import (ImageItem, ImageSize, ModalityData, + ModalityDataItems, MultiModalDataItems, + MultiModalDataParser, 
VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import AutoWeightsLoader, maybe_prefix -RawImageType = Union[Image.Image, torch.Tensor] - - -class MiniCPMVRawImageInput(TypedDict): - """Input mapper input with auxiliary data for computing image bounds.""" - image: RawImageType +CPU_DEVICE = torch.device("cpu") - # Image bounds token ids in 0-dim scaler tensor. - im_start_id: torch.Tensor - im_end_id: torch.Tensor - slice_start_id: NotRequired[torch.Tensor] - slice_end_id: NotRequired[torch.Tensor] +RawImageType = Union[Image.Image, torch.Tensor] class MiniCPMVImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: List[torch.Tensor] """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` Note that the image size may vary, so we pass it as a list instead of a batched tensor. @@ -82,14 +79,14 @@ class MiniCPMVImagePixelInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ tgt_sizes: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(height, width)` format. """ @@ -99,7 +96,8 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor """ - Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + Shape: `(batch_size * num_images * num_slices, + image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. instead of a batched tensor. @@ -107,7 +105,7 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ @@ -116,6 +114,93 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs] + +class MiniCPMVEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): + + def __init__(self, data: Dict, modality: str) -> None: + super().__init__(data, modality) + + def get_processor_data(self) -> Mapping[str, object]: + return self.data + + def get_passthrough_data(self) -> Mapping[str, object]: + return {} + + def get_count(self) -> int: + return len(self.data[f"{self.modality}_embeds"]) + + def get(self, index: int) -> Dict[str, torch.Tensor]: + out = {} + for k, v in self.data.items(): + out[k] = v[index] + return out + + +class MiniCPMVImageEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "image") + image_embeds = self.data.get("image_embeds", None) + image_sizes = self.data.get("image_sizes", None) + if image_embeds is None: + raise ValueError("In correct type of image_embeds", + "Got type: None") + if not isinstance(image_embeds[0], torch.Tensor): + raise ValueError("In correct type of image_embeds", + f"Got type: {type(image_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." 
+ "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if len(image_embeds[0].shape) == 2: + image_embeds = [image_embeds] + image_sizes = [image_sizes] + self.data["image_embeds"] = image_embeds + self.data["image_sizes"] = image_sizes + + def get_image_size(self, index: int) -> ImageSize: + image_size = self.data["image_sizes"][index] + return ImageSize(width=image_size[0], height=image_size[1]) + + +class MiniCPMVVideoEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "video") + video_embeds = self.data.get("video_embeds", None) + image_sizes = self.data.get("image_sizes", None) + num_frames = self.data.get("num_frames", None) + if video_embeds is None: + raise ValueError("In correct type of video_embeds", + "Got type: None") + if not isinstance(video_embeds[0], torch.Tensor): + raise ValueError("In correct type of video_embeds", + f"Got type: {type(video_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." + "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if num_frames is None: + raise ValueError("In correct type of numframes", "Got type: None") + if len(video_embeds[0].shape) == 2: + video_embeds = [video_embeds] + image_sizes = [image_sizes] + num_frames = [num_frames] + self.data["video_embeds"] = video_embeds + self.data["image_sizes"] = image_sizes + self.data["num_frames"] = num_frames + + def get_frame_size(self, index: int) -> ImageSize: + frame_size = self.data["image_sizes"][index] + return ImageSize(width=frame_size[0], height=frame_size[1]) + + def get_num_frames(self, index: int) -> int: + return self.data["num_frames"][index] + + DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) @@ -212,25 +297,6 @@ def forward(self, x: torch.Tensor, return x -def _build_image_input(ctx: InputContext, - image: RawImageType) -> MiniCPMVRawImageInput: - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code) - if hasattr(tokenizer, "slice_start_id"): - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id), - slice_start_id=torch.tensor(tokenizer.slice_start_id), - slice_end_id=torch.tensor(tokenizer.slice_end_id)) - else: - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id)) - - def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: version_float = getattr(config, "version", None) @@ -240,129 +306,512 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: if config.hidden_size == 2304 and config.query_num == 64: return (2, 0) return (2, 5) - version_str = str(version_float) return tuple(int(x) for x in version_str.split(".")) -def get_max_minicpmv_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config() - return getattr(hf_config, "query_num", 64) +class MiniCPMVMultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMVImageEmbeddingItems(data) + return super()._parse_image_data(data) + + def _parse_video_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return 
MiniCPMVVideoEmbeddingItems(data) + return super()._parse_video_data(data) + + +class MiniCPMVProcessingInfo(BaseProcessingInfo): + image_pattern = "(./)" + video_pattern = "()" + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor( + self, + **kwargs: object, + ): + hf_processor = self.ctx.get_hf_processor() + return hf_processor + + def get_image_processor(self): + hf_processor = self.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + return image_processor + + def get_model_version(self): + return get_version_by_config(self.get_hf_config()) + + def get_supported_mm_modalities(self) -> List[str]: + if self.get_model_version() == (2, 6): + return ["image", "video"] + else: + return ["image"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + if self.get_model_version() == (2, 6): + return {"image": None, "video": None} + else: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + mm_max_tokens = {"image": self.get_max_image_tokens()} + if self.get_model_version() == (2, 6): + mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) + return mm_max_tokens + + def get_max_video_frame_tokens(self) -> int: + frame_size = self.get_video_frame_size_with_most_features() + return self.get_num_image_tokens(frame_size, + self.get_video_max_slice_num()) + + def get_max_video_tokens(self, seq_len: int) -> int: + return self.get_max_video_frame_tokens( + ) * self.get_num_frames_with_most_features(seq_len) + + def get_max_audio_tokens(self) -> int: + return self.get_max_audio_tokens_per_chunk( + ) * self.get_max_audio_chunks_with_most_features() + + def get_slice_query_num(self) -> int: + hf_config = self.get_hf_config() + query_num = getattr(hf_config, "query_num", 64) + return query_num + + def get_max_slice_num(self) -> int: + hf_config = self.get_hf_config() + max_slice_num = getattr(hf_config, "max_slice_num", 9) + return max_slice_num + + def get_sliced_grid(self, image_size: ImageSize, + max_slice_num: int) -> Tuple[int, int]: + if self.get_model_version() == (2, 6): + slice_grid = self.get_image_processor().get_sliced_grid( + image_size, max_slice_num) + else: + slice_grid = self.get_image_processor().get_sliced_grid(image_size) + return slice_grid + + def get_num_image_tokens(self, image_size: ImageSize, + max_slice_num: int) -> int: + slice_grid = self.get_sliced_grid(image_size, max_slice_num) + num_tokens = self.get_slice_query_num( + ) + 2 # ( * query_num) + if slice_grid is not None: + if self.get_model_version() == (2, 6): + num_additional_tokens = 0 + else: + # ( * query_num) + num_additional_tokens = 2 + num_tokens += ((self.get_slice_query_num() + 2) \ + * slice_grid[0] * slice_grid[1]) \ + + slice_grid[1] - 1 + num_additional_tokens + return num_tokens + def get_image_slice_nums(self, image_size: torch.Tensor, + max_slice_nums: int) -> int: + grid = self.get_sliced_grid(image_size, max_slice_nums) + return 1 if grid is None else grid[0] * grid[1] + 1 -def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int): - return SequenceData.from_prompt_token_counts((0, seq_len)) + def get_max_image_tokens(self) -> int: + image_size = self.get_image_size_with_most_features() + return self.get_num_image_tokens(image_size, self.get_max_slice_num()) + def get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 9:1) + return self.get_default_image_sizes(self.get_max_slice_num()) -def 
dummy_image_for_minicpmv(ctx: InputContext, hf_config: PretrainedConfig, - num_images: int): - width = height = hf_config.image_size - image = _build_image_input(ctx, - image=Image.new("RGB", (width, height), - color=0)) - return {"image": [image] if num_images == 1 else [image] * num_images} + def get_video_max_slice_num(self) -> int: + return 1 + def get_video_frame_size_with_most_features(self) -> ImageSize: + return self.get_default_image_sizes(self.get_video_max_slice_num()) -def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config() - num_images = mm_counts["image"] + def get_max_video_frames(self, max_tokens: int) -> int: + num_frame_tokens = self.get_max_video_frame_tokens() + num_frames = max_tokens // num_frame_tokens + return num_frames - seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images) - mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images) + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) - return DummyData(seq_data, mm_data) + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens) + num_frames = max(max_total_frames // max(max_videos, 1), 1) -def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - model_config = ctx.model_config - version = get_version_by_config(model_config.hf_config) - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_processor = cached_get_image_processor(model_config.tokenizer) + return num_frames - def get_placeholder(image_size: Tuple[int, int], num_image: int): + def get_default_image_sizes(self, num_slices: int) -> ImageSize: + image_size = getattr(self.get_hf_config(), "image_size", 448) + return ImageSize(width=image_size, height=image_size * num_slices) + + +class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[MiniCPMVProcessingInfo] + ): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_width, image_height = \ + self.info.get_image_size_with_most_features() + video_width, video_height = \ + self.info.get_video_frame_size_with_most_features() + num_video_frames = \ + self.info.get_num_frames_with_most_features(seq_len) + + mm_data = { + "image": + self._get_dummy_images(width=image_width, + height=image_height, + num_images=num_images), + "video": [ + self._get_dummy_images(width=video_width, + height=video_height, + num_images=num_video_frames) + ] * num_videos, + } + + image_prompt_texts = self.info.image_pattern * num_images + video_prompt_texts = self.info.video_pattern * num_videos + + return ProcessorInputs(prompt_text=image_prompt_texts + + video_prompt_texts, + mm_data=mm_data) + + +class MiniCPMVMultiModalProcessor( + BaseMultiModalProcessor[MiniCPMVProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMVMultiModalDataParser() + + def get_slice_image_placeholder(self, image_size: ImageSize, + **kwargs) -> 
str: + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() if version == (2, 0) or version == (2, 5): return image_processor.get_slice_image_placeholder(image_size) return image_processor.get_slice_image_placeholder( - image_size, num_image) - - prompt = inputs.get("prompt") - token_ids = inputs.get("prompt_token_ids") - if prompt is None: - prompt = tokenizer.decode(token_ids) - - pattern = "(./)" - images = multi_modal_data["image"] - image_tags = re.findall(pattern, prompt) - if len(image_tags) == 0: - new_token_ids = token_ids - new_prompt = prompt - else: - if isinstance(images, dict): - image_size_list = images.get("image_size_list") - images = [images.get("image_embeds")] + image_size, **kwargs) + + def get_image_prompt_texts(self, + image_size: ImageSize, + image_idx: int = 0) -> str: + prompt_texts = self.get_slice_image_placeholder(image_size, + image_idx=image_idx) + return prompt_texts + + def get_video_prompt_texts(self, image_size: ImageSize, + num_frames: int) -> str: + prompt_texts = "".join( + self.get_slice_image_placeholder( + image_size=image_size, + image_idx=0, + max_slice_nums=self.info.get_video_max_slice_num(), + use_image_id=False) for image_idx in range(num_frames)) + return prompt_texts + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = { + "im_start_id": torch.tensor(tokenizer.im_start_id), + "im_end_id": torch.tensor(tokenizer.im_end_id) + } + if hasattr(tokenizer, "slice_start_id"): + special_tokens["slice_start_id"] = torch.tensor( + tokenizer.slice_start_id) + special_tokens["slice_end_id"] = torch.tensor( + tokenizer.slice_end_id) + return special_tokens + + @staticmethod + def repack_processor_outputs(outputs: Any) -> BatchFeature: + valid_keys = ["pixel_values", "image_sizes", "tgt_sizes"] + outputs = {key: outputs[key][0] for key in valid_keys} + return outputs + + def process_images(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + images = mm_data.pop("images", []) + image_embeds = mm_data.pop("image_embeds", []) + if isinstance(images, Image.Image): + images = [images] + if isinstance(images, (list, torch.Tensor)) and len(images) > 0: + image_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(images), + mm_data={"images": images}, + mm_kwargs=mm_kwargs) + image_outputs = MiniCPMVMultiModalProcessor.\ + repack_processor_outputs(image_outputs) + elif len(image_embeds) > 0: + image_sizes = mm_data.pop("image_sizes", None) + image_outputs = { + "image_embeds": torch.cat(image_embeds), + "image_sizes": image_sizes + } else: - if isinstance(images, Image.Image): - images = [images] - image_size_list = [image.size for image in images] - - text_chunks = prompt.split(pattern) - new_prompt_chunks: List[str] = [] - for i in range(len(image_size_list)): - new_prompt_chunks += [ - text_chunks[i], - get_placeholder(image_size_list[i], i) - ] - new_prompt_chunks.append(text_chunks[-1]) - new_prompt = "".join(new_prompt_chunks) - new_token_ids = tokenizer.encode(new_prompt) - - multi_modal_data["image"] = [ - _build_image_input(ctx, image) for image in images - ] + image_outputs = {} + return image_outputs + + def process_videos(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + videos = mm_data.pop("videos", []) + video_embeds = mm_data.pop("video_embeds", []) + if len(videos) > 0 and isinstance(videos[0], Image.Image): + videos = [videos] + 
if isinstance(videos, list) and len(videos) > 0: + video_outputs = { + "video_pixel_values": [], + "video_image_sizes": [], + "video_tgt_sizes": [], + "num_frames": [] + } + for video in videos: + parsed_video = [] + for frame in video: + if isinstance(frame, np.ndarray): + parsed_video.append(Image.fromarray(frame)) + else: + parsed_video.append(frame) + video = parsed_video + single_video_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(video), + mm_data={"images": video}, + mm_kwargs={ + **mm_kwargs, "max_slice_nums": + self.info.get_video_max_slice_num() + }) + video_outputs["num_frames"].append(len(video)) + for key in single_video_outputs: + if "video_" + key in video_outputs: + if key == "image_sizes": + video_outputs["video_" + key].append( + single_video_outputs[key][0][0]) + else: + video_outputs["video_" + + key] += single_video_outputs[key][0] + elif len(video_embeds): + image_sizes = mm_data.pop("image_sizes", None) + num_frames = mm_data.pop("num_frames", None) + video_outputs = { + "video_embeds": torch.cat(video_embeds), + "video_image_sizes": image_sizes, + "num_frames": num_frames + } + else: + video_outputs = {} + return video_outputs - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - ) + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video)>./\)" + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video)>./\)" -def input_mapper_for_minicpmv(ctx: InputContext, data: object): - model_config = ctx.model_config + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs) + } - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - if image_processor is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") + def get_input_modalities(self, mm_data) -> List[str]: + supported_mm_modalities = self.info.get_supported_mm_modalities() + input_modalities = [] + for modality in supported_mm_modalities: + if modality in mm_data and mm_data[modality] != {}: + input_modalities.append(modality) + return input_modalities + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "image": + return "image_sizes" + elif modality == "video": + return "video_image_sizes" + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "image": + return self.info.get_image_slice_nums( + inputs[modality]["image_sizes"][index], + self.info.get_max_slice_num()) + elif modality == "video": + return self.info.get_image_slice_nums( + inputs[modality]["video_image_sizes"][index], + self.info.get_video_max_slice_num() + ) * inputs[modality]["num_frames"][index] + else: + raise ValueError(f"UnExpected modality: {modality}") + + def check_mm_inputs(self, inputs: Dict[str, object], + matches: List[str]) -> None: + counts = Counter(matches) + for modality, count in counts.items(): + if modality not in inputs or not inputs[modality]: + raise ValueError(f"None input data of {modality}." 
+ "But prompt requires.") + counter_key = self.get_modality_num_counter(modality) + if len(inputs[modality][counter_key]) != count: + raise ValueError(f"The prompt requires {count} " + f"{modality} inputs while you pass " + f"{len(inputs[modality][counter_key])}") + + def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "image": + return self.get_image_prompt_texts( + inputs["image"]["image_sizes"][index], index) + elif modality == "video": + return self.get_video_prompt_texts( + inputs["video"]["video_image_sizes"][index], + inputs["video"]["num_frames"][index]) + else: + raise ValueError(f"UnExpected modality: {modality}") - if not isinstance(data, list): - raise ValueError( - "Image input must be list of MiniCPMVImageInput, got (%s)", data) + def call_base_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + return super()._call_hf_processor(prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + # Do not support combination inputs of images and videos for now + # Try to handle interleaved multimodal data + tokenizer = self.info.get_tokenizer() + inputs = self.process_mm_inputs(mm_data, mm_kwargs) + mm_input_modalities = self.get_input_modalities(inputs) + num_mm_slices = {modality: [] for modality in mm_input_modalities} + for modality in mm_input_modalities: + num_counter_key = self.get_modality_num_counter(modality) + for index in range(len(inputs[modality][num_counter_key])): + num_mm_slices[modality].append( + self.get_num_slices_by_modality(inputs, modality, index)) + return { + "input_ids": np.array([tokenizer.encode(prompt)]), + **{ + key: value + for modality in inputs + for key, value in inputs[modality].items() + }, + **{ + f"{modality}_num_slices": num_mm_slices[modality] + for modality in mm_input_modalities + } + } - if len(data) > 0 and isinstance(data[0]['image'], torch.Tensor): - batch_data = { - "image_embeds": data[0]['image'], + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, } - else: - batch_data = image_processor \ - .preprocess([img["image"] for img in data], return_tensors="pt") \ - .data - if len(data) > 0: - batch_data["im_start_id"] = data[0]["im_start_id"] - batch_data["im_end_id"] = data[0]["im_end_id"] - if "slice_start_id" in data[0]: - batch_data["slice_start_id"] = data[0]["slice_start_id"] - batch_data["slice_end_id"] = data[0]["slice_end_id"] + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + else: # video + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video") + ] - return MultiModalKwargs(batch_data) + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def 
get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + image_slices = get_slices( + hf_inputs.get("image_num_slices", torch.empty(0))) + video_slices = get_slices( + hf_inputs.get("video_num_slices", torch.empty(0))) + + return dict( + pixel_values=MultiModalFieldConfig.flat("image", image_slices), + image_sizes=MultiModalFieldConfig.batched("image"), + tgt_sizes=MultiModalFieldConfig.flat("image", image_slices), + image_num_slices=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.flat("image", image_slices), + video_pixel_values=MultiModalFieldConfig.flat( + "video", video_slices), + video_image_sizes=MultiModalFieldConfig.batched("video"), + video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices), + video_embeds=MultiModalFieldConfig.flat("video", video_slices), + video_num_slices=MultiModalFieldConfig.batched("video")) + + def apply( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + supported_mm_modalities = self.info.get_supported_mm_modalities() + if isinstance(prompt, list): + prompt = self.info.get_tokenizer().decode(prompt) + matches = re.findall(self.get_placeholder_match_pattern(), prompt) + mm_orders = { + f"{modality}_orders": + torch.tensor( + [index for index, m in enumerate(matches) if m == modality]) + for modality in supported_mm_modalities + } + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + # Exclude x from placeholders + if "image" in result["mm_placeholders"] and \ + self.info.get_model_version() == (2, 6): + result["mm_placeholders"]["image"] = [ + PlaceholderRange(offset=p["offset"] + 3 + idx // 10, + length=p["length"] - 3 - idx // 10) + for idx, p in enumerate(result["mm_placeholders"]["image"]) + ] + result["mm_kwargs"].update(**mm_orders) + result["mm_kwargs"].update(**self.get_special_tokens()) + return result class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): @@ -409,7 +858,7 @@ def sampler(self): return get_sampler() - def get_embedding( + def get_embedding_with_vision( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], @@ -471,25 +920,46 @@ def _get_image_bounds( image_end_tokens[:valid_image_nums].unsqueeze(-1), ]) - def _parse_and_validate_inputs( + def _parse_and_validate_image_inputs( self, input_ids: torch.Tensor, **kwargs: object, ) -> Optional[MiniCPMVImageInputs]: - pixel_values = kwargs.pop("pixel_values", []) - tgt_sizes = kwargs.pop("tgt_sizes", []) + mm_data = { + "image": { + key: kwargs.pop(key, []) + for key in ["pixel_values", "tgt_sizes", "image_num_slices"] + }, + "video": { + "pixel_values": kwargs.pop("video_pixel_values", []), + "tgt_sizes": kwargs.pop("video_tgt_sizes", []), + "video_num_slices": kwargs.pop("video_num_slices", []) + } + } im_start_id = kwargs.pop("im_start_id", None) im_end_id = kwargs.pop("im_end_id", None) slice_start_id = kwargs.pop("slice_start_id", None) slice_end_id = kwargs.pop("slice_end_id", None) + mm_orders = { + f"{modality}": kwargs.pop(f"{modality}_orders", None) + for modality in ["image", "video", "audio"] + } + batch_size = max(len(mm_data["image"]["pixel_values"]), + len(mm_data["video"]["pixel_values"])) image_embeds = kwargs.pop("image_embeds", None) - + video_embeds = kwargs.pop("video_embeds", None) + if image_embeds 
is not None and video_embeds is not None: + raise ValueError( + "Incorrect inputs for vision embeddings. " + "Image embeds and video embeds can not exist simultaneously.") + if video_embeds is not None: + image_embeds = video_embeds if image_embeds is not None: if not isinstance(image_embeds, (torch.Tensor, list)): raise ValueError(f"Incorrect type of image embeds. " f"Got type: {type(image_embeds)}") - if isinstance(image_embeds, list): - image_embeds = torch.concat(image_embeds) + image_embeds = torch.concat( + [image_embeds[i] for i in range(len(image_embeds))]) return MiniCPMVImageEmbeddingInputs( image_bounds=self._get_image_bounds(input_ids, im_start_id, @@ -498,29 +968,47 @@ def _parse_and_validate_inputs( data=image_embeds, type="image_embeds", ) - - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if not isinstance(tgt_sizes, (torch.Tensor, list)): - raise ValueError("Incorrect type of target sizes. " - f"Got type: {type(tgt_sizes)}") - - if len(pixel_values) != len(tgt_sizes): - raise ValueError("Inconsistent batch lengths, found: " - f"{len(pixel_values)} vs. {len(tgt_sizes)}") + for modality, modality_mm_data in mm_data.items(): + if not isinstance(modality_mm_data["pixel_values"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of pixel values. " + f"Got type: {type(modality_mm_data['pixel_values'])}") + + if not isinstance(modality_mm_data["tgt_sizes"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of target sizes. " + f"Got type: {type(modality_mm_data['tgt_sizes'])}") + + if len(modality_mm_data["pixel_values"]) != len( + modality_mm_data["tgt_sizes"]): + raise ValueError( + "Inconsistent batch lengths, found: " + f"{len(modality_mm_data['pixel_values'])} vs. 
" + f"{len(modality_mm_data['tgt_sizes'])}") pixel_values_flat: List[torch.Tensor] = [] tgt_sizes_flat: List[torch.Tensor] = [] - for pixel_b, tgt_b in zip(pixel_values, tgt_sizes): - if len(pixel_b) != len(tgt_b): - raise ValueError("Inconsistent N lengths, found: " - f"{len(pixel_b)} vs {len(tgt_b)}") - - for pixel_n, tgt_n in zip(pixel_b, tgt_b): - pixel_values_flat += pixel_n - tgt_sizes_flat += tgt_n + for b in range(batch_size): + mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \ + else {"image": 0} + mm_slice_counts = {"image": 0, "video": 0} \ + if self.version == (2, 6) else {"image": 0} + mm_orders_b = [(index, modality) for modality in mm_counts + for index in mm_orders[modality][b]] + for _, modality in sorted(mm_orders_b, key=lambda x: x[0]): + pos = mm_counts[modality] + num_slices = mm_data[modality][f"{modality}_num_slices"][b][ + pos] + slice_start_idx = mm_slice_counts[modality] + slice_end_idx = slice_start_idx + num_slices + pixel_values_flat += mm_data[modality]["pixel_values"][b][ + slice_start_idx:slice_end_idx] + tgt_sizes_flat += mm_data[modality]["tgt_sizes"][b][ + slice_start_idx:slice_end_idx] + mm_counts[modality] += 1 + mm_slice_counts[modality] += num_slices # NOTE: Input IDs does not contain image tokens during memory profiling, # so we allow it to be empty @@ -544,6 +1032,10 @@ def _parse_and_validate_inputs( type="pixel_values", ) + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + return self._parse_and_validate_image_inputs(input_ids, **kwargs) + def forward( self, input_ids: torch.Tensor, @@ -556,9 +1048,10 @@ def forward( if intermediate_tensors is not None: vlm_embeddings = None else: - image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs) - - vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs) + image_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent @@ -964,15 +1457,15 @@ def get_vision_hidden_states(self, _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, - (2, 6): MiniCPMV2_6 + (2, 6): MiniCPMV2_6, } -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_minicpmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) -class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMVMultiModalProcessor, + info=MiniCPMVProcessingInfo, + dummy_inputs=MiniCPMVDummyInputsBuilder) +class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ Different versions of MiniCPMV use different visual encoders and LLMs, which is not conducive to the current integration logic of LoRA and diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d71b19060bf4..de05bf2b772f5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -162,6 +162,7 @@ "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 + "MiniCPMO": 
("minicpmo", "MiniCPMO"), "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"), From ff7424f491935a1b4737bcc1570de0d616fc22f3 Mon Sep 17 00:00:00 2001 From: Yanyi Liu Date: Wed, 29 Jan 2025 17:41:01 +0800 Subject: [PATCH 34/69] [Frontend] Support override generation config in args (#12409) Signed-off-by: liuyanyi --- tests/test_config.py | 70 ++++++++++++++++++++++++++++++++++++++++ vllm/config.py | 13 ++++++-- vllm/engine/arg_utils.py | 25 ++++++++++---- 3 files changed, 100 insertions(+), 8 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 4518adfc31bfc..ec366b93d6a37 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -281,3 +281,73 @@ def test_uses_mrope(model_id, uses_mrope): ) assert config.uses_mrope == uses_mrope + + +def test_generation_config_loading(): + model_id = "Qwen/Qwen2.5-1.5B-Instruct" + + # When set generation_config to None, the default generation config + # will not be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None) + assert model_config.get_diff_sampling_param() == {} + + # When set generation_config to "auto", the default generation config + # should be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto") + + correct_generation_config = { + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_p": 0.8, + "top_k": 20, + } + + assert model_config.get_diff_sampling_param() == correct_generation_config + + # The generation config could be overridden by the user. + override_generation_config = {"temperature": 0.5, "top_k": 5} + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto", + override_generation_config=override_generation_config) + + override_result = correct_generation_config.copy() + override_result.update(override_generation_config) + + assert model_config.get_diff_sampling_param() == override_result + + # When generation_config is set to None and override_generation_config + # is set, the override_generation_config should be used directly. + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None, + override_generation_config=override_generation_config) + + assert model_config.get_diff_sampling_param() == override_generation_config diff --git a/vllm/config.py b/vllm/config.py index d7c9311ae3cb0..58464eae80b82 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -165,6 +165,8 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. + override_generation_config: Override the generation config with the + given config. 
""" def compute_hash(self) -> str: @@ -225,6 +227,7 @@ def __init__( logits_processor_pattern: Optional[str] = None, generation_config: Optional[str] = None, enable_sleep_mode: bool = False, + override_generation_config: Optional[Dict[str, Any]] = None, ) -> None: self.model = model self.tokenizer = tokenizer @@ -368,6 +371,7 @@ def __init__( self.logits_processor_pattern = logits_processor_pattern self.generation_config = generation_config + self.override_generation_config = override_generation_config or {} self._verify_quantization() self._verify_cuda_graph() @@ -904,8 +908,13 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: """ if self.generation_config is None: # When generation_config is not set - return {} - config = self.try_get_generation_config() + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + available_params = [ "repetition_penalty", "temperature", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba96484e3fce9..1f203b6eaeb33 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -195,6 +195,7 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None + override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False calculate_kv_scales: Optional[bool] = None @@ -936,12 +937,23 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=nullable_str, default=None, help="The folder path to the generation config. " - "Defaults to None, will use the default generation config in vLLM. " - "If set to 'auto', the generation config will be automatically " - "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path. If " - "`max_new_tokens` is specified, then it sets a server-wide limit " - "on the number of output tokens for all requests.") + "Defaults to None, no generation config is loaded, vLLM defaults " + "will be used. If set to 'auto', the generation config will be " + "loaded from model path. If set to a folder path, the generation " + "config will be loaded from the specified folder path. If " + "`max_new_tokens` is specified in generation config, then " + "it sets a server-wide limit on the number of output tokens " + "for all requests.") + + parser.add_argument( + "--override-generation-config", + type=json.loads, + default=None, + help="Overrides or sets generation config in JSON format. " + "e.g. ``{\"temperature\": 0.5}``. If used with " + "--generation-config=auto, the override parameters will be merged " + "with the default config from the model. If generation-config is " + "None, only the override parameters are used.") parser.add_argument("--enable-sleep-mode", action="store_true", @@ -1002,6 +1014,7 @@ def create_model_config(self) -> ModelConfig: override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, + override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, ) From b02fd288b28f0bfa2d7ac8958fe0d71ec22ffc1b Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 29 Jan 2025 01:46:12 -0800 Subject: [PATCH 35/69] [Hardware][NV] Fix Modelopt model loading for k-v-scales for Llama models. 
(#11787) Signed-off-by: Pavani Majety Co-authored-by: mgoin --- vllm/model_executor/model_loader/weight_utils.py | 11 ++++++++++- vllm/model_executor/models/llama.py | 9 +++++---- vllm/model_executor/models/mixtral.py | 6 +++++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b764a940b1742..e4d103f7cab99 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -652,9 +652,18 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: return remapped_name possible_scale_names = [".k_scale", ".v_scale"] + modelopt_scale_names = [ + ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" + ] for scale_name in possible_scale_names: if name.endswith(scale_name): - remapped_name = name.replace(scale_name, f".attn{scale_name}") + if any(mo_scale_name in name + for mo_scale_name in modelopt_scale_names): + remapped_name = name.replace( + f".self_attn.{scale_name[1]}_proj{scale_name}", + f".self_attn.attn{scale_name}") + else: + remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e214c30f5d60b..e7c264c04f1aa 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -404,6 +404,11 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + if "scale" in name: + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -423,10 +428,6 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index da415cdae96ed..fbb3704fa080f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -452,7 +452,11 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) From 27b78c73cad00f5c7bb3b2431f02dc680f7034bc Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Wed, 29 Jan 2025 22:07:09 +0800 Subject: [PATCH 36/69] [Kernel] add triton fused moe kernel for gptq/awq (#12185) --- tests/kernels/test_moe.py | 91 ++++ .../layers/fused_moe/fused_moe.py | 407 ++++++++++++++--- .../layers/quantization/__init__.py | 7 +- .../layers/quantization/moe_wna16.py | 424 ++++++++++++++++++ 4 files changed, 874 insertions(+), 55 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/moe_wna16.py diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 7fa5de1984452..7aa248ed1475c 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -18,6 +18,8 @@ fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + quantize_weights) from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -55,6 +57,95 @@ def test_fused_moe( rtol=0) +@pytest.mark.parametrize("m", [1, 32, 222]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("group_size", [64, 128]) +@pytest.mark.parametrize("has_zp", [True, False]) +@pytest.mark.parametrize("weight_bits", [4, 8]) +def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype, group_size: int, has_zp: bool, + weight_bits: int): + print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits) + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + if weight_bits == 4: + pack_factor = 2 + quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8 + elif weight_bits == 8: + pack_factor = 1 + quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128 + + w1_ref = w1.clone() + w2_ref = w2.clone() + w1_qweight = torch.empty((e, 2 * n, k // pack_factor), + device="cuda", + dtype=torch.uint8) + w2_qweight = torch.empty((e, k, n // pack_factor), + device="cuda", + dtype=torch.uint8) + w1_scales = torch.empty((e, 2 * n, k // group_size), + device="cuda", + dtype=dtype) + w2_scales = torch.empty((e, k, n // group_size), + device="cuda", + dtype=dtype) + w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size), + device="cuda", + dtype=torch.uint8) + w2_qzeros = torch.empty((e, k // pack_factor, n // group_size), + device="cuda", + dtype=torch.uint8) + + for i in range(e * 2): + expert_id = i % e + if i // e == 0: + w, w_ref, w_qweight, w_scales, w_qzeros = \ + w1, w1_ref, w1_qweight, w1_scales, w1_qzeros + else: + w, w_ref, w_qweight, w_scales, w_qzeros = \ + w2, w2_ref, w2_qweight, w2_scales, w2_qzeros + weight, qweight, scales, qzeros = quantize_weights( + w[expert_id].T, quant_type, group_size, has_zp, False) + weight = weight.T + qweight = qweight.T.contiguous().to(torch.uint8) + scales = scales.T + if has_zp: + qzeros = 
qzeros.T.contiguous().to(torch.uint8) + if weight_bits == 4: + qweight = qweight[:, 1::2] * 16 + qweight[:, ::2] + if has_zp: + qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :] + + w_ref[expert_id] = weight + w_qweight[expert_id] = qweight + w_scales[expert_id] = scales + if has_zp: + w_qzeros[expert_id] = qzeros + + triton_output = fused_moe(a, + w1_qweight, + w2_qweight, + score, + topk, + renormalize=False, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size]) + torch_output = torch_moe(a, w1_ref, w2_ref, score, topk) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 308c1d6ac6db1..dbb6c2ce4649e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -19,6 +19,206 @@ logger = init_logger(__name__) +@triton.jit +def fused_moe_kernel_gptq_awq( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + b_scale_ptr, + b_zp_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N: tl.constexpr, + K: tl.constexpr, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_bse, + stride_bsk, + stride_bsn, + stride_bze, + stride_bzk, + stride_bzn, + block_k_diviable: tl.constexpr, + group_size: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + has_zp: tl.constexpr, + use_int4_w4a16: tl.constexpr, + use_int8_w8a16: tl.constexpr): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. 
The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) + + if use_int4_w4a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn + b_shifter = (offs_k[:, None] % 2) * 4 + elif use_int8_w8a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn + + if not has_zp and use_int4_w4a16: + b_zp_num = 8 + if not has_zp and use_int8_w8a16: + b_zp_num = 128 + elif has_zp and use_int4_w4a16: + b_zp_shifter = (offs_bn[None, :] % 2) * 4 + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + else: + k_mask = None + k_other = None + + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk + b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other) + b_scale = b_scale.to(tl.float32) + + if has_zp and use_int4_w4a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + (offs_bn[None, :] // 2) * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = ((b_zp >> b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + elif has_zp and use_int8_w8a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + offs_bn[None, :] * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = b_zp.to(tl.float32) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + @triton.jit def fused_moe_kernel( # Pointers to matrices @@ -266,6 +466,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], B_scale: Optional[torch.Tensor], + B_zp: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, @@ -277,6 +478,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, compute_type: tl.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_shape: Optional[List[int]] = None) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -292,50 +494,108 @@ def invoke_fused_moe_kernel(A: torch.Tensor, assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a16: + elif use_int8_w8a16 or use_int4_w4a16: assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 else: assert A_scale is None assert B_scale is None - grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ - 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) + EM = sorted_token_ids.shape[0] + if A.shape[0] < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. 
+ # We assume that top_ids of each token is unique, so + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. + EM = min(sorted_token_ids.shape[0], + A.shape[0] * top_k * config['BLOCK_SIZE_M']) + grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( + B.shape[1], META['BLOCK_SIZE_N']), ) + + if (use_int8_w8a16 or use_int4_w4a16) and \ + block_shape is not None and block_shape[1] > 0: + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + + fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + A.shape[1], + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + block_k_diviable=A.shape[1] % config["BLOCK_SIZE_K"] == 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + **config, + ) - fused_moe_kernel[grid]( - A, - B, - C, - A_scale, - B_scale, - topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - B.shape[1], - B.shape[2], - sorted_token_ids.shape[0], - topk_ids.numel(), - A.stride(0), - A.stride(1), - B.stride(0), - B.stride(2), - B.stride(1), - C.stride(1), - C.stride(2), - A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, - A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, - B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, - B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, - B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, - 0 if block_shape is None else block_shape[0], - 0 if block_shape is None else block_shape[1], - MUL_ROUTED_WEIGHT=mul_routed_weight, - top_k=top_k, - compute_type=compute_type, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - **config, - ) + else: + fused_moe_kernel[grid]( + A, + B, + C, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + A.shape[1], + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + **config, + ) def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str: @@ -432,7 +692,7 @@ def try_get_optimal_moe_config( # NOTE: For block-wise quant, # BLOCK_K must be divisible by block_shape[1] # BLOCK_N and BLOCK_M has no requirements - if block_shape is not None: + if block_shape is not None and block_shape[0] != 0: 
config["BLOCK_SIZE_N"] = block_shape[0] config["BLOCK_SIZE_K"] = block_shape[1] return config @@ -531,12 +791,15 @@ def grouped_topk(hidden_states: torch.Tensor, def get_config_dtype_str(dtype: torch.dtype, + use_int4_w4a16: Optional[bool] = False, use_int8_w8a16: Optional[bool] = False, use_fp8_w8a8: Optional[bool] = False): if use_fp8_w8a8: return "fp8_w8a8" elif use_int8_w8a16: return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w8a16" elif dtype == torch.float: # avoiding cases where kernel fails when float32 MoE # use fp16/bfloat16 configs @@ -551,14 +814,17 @@ def inplace_fused_experts(hidden_states: torch.Tensor, topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, - use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale, - a1_scale, a2_scale, block_shape) + use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, w1_scale, + w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape) def inplace_fused_experts_fake( @@ -569,8 +835,11 @@ def inplace_fused_experts_fake( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> None: @@ -593,14 +862,18 @@ def outplace_fused_experts( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> torch.Tensor: return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, - False, use_fp8_w8a8, use_int8_w8a16, w1_scale, - w2_scale, a1_scale, a2_scale, block_shape) + False, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp, + a1_scale, a2_scale, block_shape) def outplace_fused_experts_fake( @@ -611,8 +884,11 @@ def outplace_fused_experts_fake( topk_ids: torch.Tensor, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> torch.Tensor: @@ -635,8 +911,11 @@ def fused_experts(hidden_states: torch.Tensor, inplace: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None): @@ -644,16 
+923,15 @@ def fused_experts(hidden_states: torch.Tensor, torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, use_int8_w8a16, - w1_scale, w2_scale, a1_scale, + use_int4_w4a16, w1_scale, + w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, block_shape) return hidden_states else: - return torch.ops.vllm.outplace_fused_experts(hidden_states, w1, w2, - topk_weights, topk_ids, - use_fp8_w8a8, - use_int8_w8a16, w1_scale, - w2_scale, a1_scale, - a2_scale, block_shape) + return torch.ops.vllm.outplace_fused_experts( + hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, + use_int8_w8a16, use_int4_w4a16, w1_scale, w2_scale, w1_zp, w2_zp, + a1_scale, a2_scale, block_shape) def fused_experts_impl(hidden_states: torch.Tensor, @@ -664,13 +942,21 @@ def fused_experts_impl(hidden_states: torch.Tensor, inplace: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None): # Check constraints. - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + if use_int4_w4a16: + assert hidden_states.shape[1] // 2 == w1.shape[ + 2], "Hidden size mismatch" + else: + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" @@ -687,6 +973,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, M = min(num_tokens, CHUNK_SIZE) config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, dtype=hidden_states.dtype) get_config_func = functools.partial( @@ -755,6 +1042,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, intermediate_cache1, a1_scale, w1_scale, + w1_zp, curr_topk_weights, curr_topk_ids, sorted_token_ids, @@ -766,6 +1054,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, block_shape=block_shape) torch.ops._C.silu_and_mul(intermediate_cache2, @@ -776,6 +1065,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, intermediate_cache3, a2_scale, w2_scale, + w2_zp, curr_topk_weights, curr_topk_ids, sorted_token_ids, @@ -787,6 +1077,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, compute_type=compute_type, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, block_shape=block_shape) ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), @@ -808,8 +1099,11 @@ def fused_moe( custom_routing_function: Optional[Callable] = None, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None, @@ -834,8 +1128,12 @@ def fused_moe( note: Deepseekv2 model uses grouped_topk - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner products for w1 and w2. 
Defaults to False. - - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner - products for w1 and w2. Defaults to False. + - use_int8_w8a16 (bool): If True, use matmul of int8 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. + - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. - w2_scale (Optional[torch.Tensor]): Optional scale to be used for @@ -873,8 +1171,11 @@ def fused_moe( inplace=inplace, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, w1_scale=w1_scale, w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index d2bde13fcf546..bd0fd47993396 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,7 +26,8 @@ "experts_int8", "neuron_quant", "ipex", - "quark" + "quark", + "moe_wna16" ] # The customized quantization methods which will be added to this dict. @@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: from .ipex_quant import IPEXConfig from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config + from .moe_wna16 import MoeWNA16Config from .neuron_quant import NeuronQuantConfig from .qqq import QQQConfig from .tpu_int8 import Int8TpuConfig @@ -121,7 +123,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, - "quark": QuarkConfig + "quark": QuarkConfig, + "moe_wna16": MoeWNA16Config, } # Update the `method_to_config` with customized quantization methods. 
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py new file mode 100644 index 0000000000000..8cd9c0a7ef253 --- /dev/null +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -0,0 +1,424 @@ +from typing import Any, Callable, Dict, List, Optional + +import torch + +from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.awq import (AWQConfig, + AWQLinearMethod) +from vllm.model_executor.layers.quantization.awq_marlin import ( + AWQMarlinConfig, AWQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.gptq import (GPTQConfig, + GPTQLinearMethod) +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig, GPTQMarlinLinearMethod) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + + +class MoeWNA16Config(QuantizationConfig): + """Config class for MOE WNA16 (W8A16/W4A16) quantization.""" + + def __init__(self, linear_quant_method: str, weight_bits: int, + group_size: int, has_zp: bool, lm_head_quantized: bool, + modules_to_not_convert: Optional[List[str]], + full_config: Dict[str, Any]) -> None: + self.weight_bits = weight_bits + self.group_size = group_size + self.has_zp = has_zp + self.bit8_pack_factor = 8 // self.weight_bits + self.lm_head_quantized = lm_head_quantized + self.linear_quant_method = linear_quant_method + self.full_config = full_config + self.use_marlin = False + if self.linear_quant_method == "gptq": + self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible( + full_config) + elif self.linear_quant_method == "awq": + capability_tuple = current_platform.get_device_capability() + device_capability = (-1 if capability_tuple is None else + capability_tuple.to_int()) + awq_min_capability = AWQConfig.get_min_capability() + if device_capability < awq_min_capability: + raise ValueError( + "The quantization method moe_wna16 + awq is not supported " + "for the current GPU. " + f"Minimum capability: {awq_min_capability}. 
" + f"Current capability: {device_capability}.") + self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible( + full_config) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + if modules_to_not_convert is None: + self.modules_to_not_convert = [] + else: + self.modules_to_not_convert = modules_to_not_convert + + @classmethod + def get_name(cls) -> str: + return "moe_wna16" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config": + linear_quant_method = cls.get_from_keys(config, ["quant_method"]) + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) + if linear_quant_method == "gptq": + has_zp = not cls.get_from_keys(config, ["sym"]) + modules_to_not_convert = [] + elif linear_quant_method == "awq": + has_zp = cls.get_from_keys(config, ["zero_point"]) + modules_to_not_convert = cls.get_from_keys( + config, ["modules_to_not_convert"]) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + return cls(linear_quant_method, weight_bits, group_size, has_zp, + lm_head_quantized, modules_to_not_convert, config) + + @classmethod + def override_quantization_method(cls, hf_quant_cfg, + user_quant) -> Optional[str]: + can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg) + if can_convert and user_quant == "moe_wna16": + return cls.get_name() + return None + + @classmethod + def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]): + # Extract data from quant config. + quant_method = quant_config.get("quant_method", "").lower() + num_bits = quant_config.get("bits") + desc_act = quant_config.get("desc_act") + + capability_tuple = current_platform.get_device_capability() + device_capability = (-1 if capability_tuple is None else + capability_tuple.to_int()) + awq_min_capability = AWQConfig.get_min_capability() + + gptq_compatible = quant_method == "gptq" and \ + not desc_act and num_bits in [4, 8] + awq_compatible = quant_method == "awq" and num_bits == 4 and \ + device_capability >= awq_min_capability + + return gptq_compatible or awq_compatible + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + if is_layer_skipped_quant(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return MoeWNA16Method(self) + else: + if self.linear_quant_method == "gptq": + if self.use_marlin: + return GPTQMarlinLinearMethod( + GPTQMarlinConfig.from_config(self.full_config)) + else: + return GPTQLinearMethod( + GPTQConfig.from_config(self.full_config)) + elif self.linear_quant_method == "awq": + if self.use_marlin: + return AWQMarlinLinearMethod( + AWQMarlinConfig.from_config(self.full_config)) + else: + return AWQLinearMethod( + AWQConfig.from_config(self.full_config)) + else: + raise ValueError("moe_wna16 only support gptq and awq.") + + +def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) + + +class MoeWNA16Method(FusedMoEMethodBase): + """Linear method for MOE WNA16 (W8A16/W4A16) quantization. 
+ + Args: + quant_config: The MOE WNA16 (W8A16/W4A16) quantization config. + """ + + def __init__(self, quant_config: MoeWNA16Config): + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + layer.quant_config = self.quant_config + bit8_pack_factor = self.quant_config.bit8_pack_factor + group_size = self.quant_config.group_size + group_size_div_factor = 1 + + # make intermediate_size and hidden_size diviable by group_size + # we reduce the group size to ensure that + # and we would repeat the loaded_weight later + while intermediate_size_per_partition % group_size or \ + hidden_size % group_size: + group_size = group_size // 2 + group_size_div_factor *= 2 + assert group_size >= 32 + layer.group_size = group_size + layer.group_size_div_factor = group_size_div_factor + + strategy = FusedMoeWeightScaleSupported.GROUP.value + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": False + }) + + assert 'weight_loader' in extra_weight_attrs + weight_loader = extra_weight_attrs['weight_loader'] + wrapped_weight_loader = MoeWNA16Method.get_weight_loader( + layer, weight_loader) + extra_weight_attrs['weight_loader'] = wrapped_weight_loader + + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // bit8_pack_factor, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // bit8_pack_factor, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + + w13_scales = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // group_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + + w2_scales = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition // group_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + + if self.quant_config.has_zp: + w13_qzeros = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition // bit8_pack_factor, + hidden_size // group_size, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + + w2_qzeros = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size // bit8_pack_factor, + intermediate_size_per_partition // group_size, + dtype=torch.uint8), + requires_grad=False) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + + if self.quant_config.linear_quant_method == "gptq": + # some param are unused, but we need to init them in order to + # load weights + invalid_param_keys = ["w13_g_idx", "w2_g_idx"] + if not self.quant_config.has_zp: + invalid_param_keys += ["w13_qzeros", "w2_qzeros"] + for key in invalid_param_keys: + param = torch.nn.Parameter(torch.empty((0, ), + dtype=torch.int32), + 
requires_grad=False) + layer.register_parameter(key, param) + set_weight_attrs(param, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + weight_bits = self.quant_config.weight_bits + has_zp = self.quant_config.has_zp + + return fused_experts(x, + layer.w13_qweight, + layer.w2_qweight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + w1_zp=layer.w13_qzeros if has_zp else None, + w2_zp=layer.w2_qzeros if has_zp else None, + block_shape=[0, layer.group_size]) + + @staticmethod + def get_weight_loader(layer, weight_loader): + + def convert_awq_tensor(tensor, tensor_type): + # convert awq qweight/qzeros to a standard format (assume int4) + # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8) + # qzeros: (k // group_size, n // pack_factor_bit32) -> + # (n // pack_factor_bit8, k // group_size) + # pack_factor_bit32 = 32 // weight_bits + # pack_factor_bit8 = 8 // weight_bits + + # 0. suppose origin shape (a, b), dtype int32 + # 1. convert to uint8, shape (a, b) -> (a, 4 * b) + size0 = tensor.size(0) + tensor = tensor.view(torch.uint8) + + # 2. unpack to uint4 (only when weight_bits == 4) + # shape (a, 4 * b) -> (a, 4 * b, 2) + shifter = torch.tensor([0, 4], + dtype=torch.uint8, + device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + + # 3. change order, see + # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py + # shape -> (a, 4 * b * pack_factor_bit8) + reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7] + tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order] + tensor = tensor.view(size0, -1) + + # 4. transpose, shape -> (4 * b * pack_factor_bit8, a) + tensor = tensor.T.contiguous() + + # 5. 
repack (only when weight_bits == 4) + # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8) + # qzeros shape -> (4 * b, a) + + if tensor_type == "qweight": + tensor = tensor[:, 1::2] * 16 + tensor[:, ::2] + elif tensor_type == "qzeros": + tensor = tensor[1::2, :] * 16 + tensor[::2, :] + return tensor + + def convert_gptq_int4_qzeros(tensor): + tensor = tensor.view(torch.uint8) + shifter = torch.tensor([0, 4], + dtype=torch.uint8, + device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + tensor = tensor + 1 + tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16 + return tensor + + def moe_wna16_weight_loader(param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, shard_id: str, + expert_id: int): + if "g_idx" in weight_name: + return + if not layer.quant_config.has_zp and "qzeros" in weight_name: + return + + device = get_tp_group().device + tp_rank = get_tensor_model_parallel_rank() + loaded_weight = loaded_weight.to(device) + shard_size = layer.intermediate_size_per_partition + + # convert gptq and awq weight to a standard format + if layer.quant_config.linear_quant_method == "awq": + assert layer.quant_config.weight_bits == 4 + if "weight" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, + "qweight") + elif "zeros" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, "qzeros") + else: + loaded_weight = loaded_weight.T + elif layer.quant_config.linear_quant_method == "gptq": + assert layer.quant_config.weight_bits in [4, 8] + if "weight" in weight_name: + loaded_weight = loaded_weight.T.contiguous().view( + torch.uint8) + elif "zeros" in weight_name: + # add 1 to gptq qzeros to align with awq + loaded_weight = loaded_weight.view(torch.uint8) + if layer.quant_config.weight_bits == 4: + loaded_weight = convert_gptq_int4_qzeros( + loaded_weight).T + else: + loaded_weight = loaded_weight.T + 1 + else: + loaded_weight = loaded_weight.T + + # repeat the qzeros/scales to fit new group size + if layer.group_size_div_factor > 1 and \ + "qzeros" in weight_name or "scales" in weight_name: + loaded_weight = loaded_weight.repeat_interleave( + layer.group_size_div_factor, 1) + + if "w13_qzeros" in weight_name: + tensor = loaded_weight.view(layer.tp_size, -1, + loaded_weight.size(1))[tp_rank] + if shard_id == "w1": + param.data[expert_id, :shard_size // 2] = tensor + else: + param.data[expert_id, shard_size // 2:] = tensor + elif "w2_qzeros" in weight_name: + param.data[expert_id] = loaded_weight.view( + loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank] + else: + weight_loader(param, loaded_weight, weight_name, shard_id, + expert_id) + + return moe_wna16_weight_loader From 73aa6cfdf789ddc67a3d2924ef52fd791554fe2a Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 29 Jan 2025 16:12:24 -0500 Subject: [PATCH 37/69] Revert "[Build/CI] Fix libcuda.so linkage" (#12552) --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dee9ec36895f..6c946fc5aa3ac 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,9 +446,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() message(STATUS "Enabling C extension.") -if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_C_LIBS cuda) -endif() define_gpu_extension_target( _C DESTINATION vllm @@ -457,7 +454,6 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} - LIBRARIES ${VLLM_C_LIBS} USE_SABI 3 WITH_SOABI) 
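As a usage illustration for the "moe_wna16" quantization method introduced in #12185 above: the method is selected through vLLM's standard `quantization` argument, and `MoeWNA16Config.override_quantization_method` picks it up when the checkpoint's GPTQ/AWQ config passes `is_moe_wna16_compatible`. The sketch below is not part of the patch and makes assumptions: the model name is only illustrative, and any GPTQ- or AWQ-quantized MoE checkpoint meeting the compatibility check should behave the same way.

    from vllm import LLM, SamplingParams

    # Illustrative checkpoint name (assumption); substitute any compatible
    # GPTQ/AWQ MoE model. "moe_wna16" routes MoE layers through the new
    # fused Triton W4A16/W8A16 kernel, while linear layers keep their
    # original GPTQ/AWQ (or Marlin) method.
    llm = LLM(model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
              quantization="moe_wna16")
    out = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
    print(out[0].outputs[0].text)
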
From e0cc5f259a8bec0d66ed0bc3e25ca245377679a1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 29 Jan 2025 13:47:33 -0800 Subject: [PATCH 38/69] [V1][BugFix] Free encoder cache for aborted requests (#12545) Signed-off-by: Woosuk Kwon --- vllm/v1/core/encoder_cache_manager.py | 9 ++++++++- vllm/v1/core/scheduler.py | 14 ++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 0cd8c806a3e47..9d570b334c6cf 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -38,7 +38,8 @@ def allocate(self, request: Request, input_id: int) -> None: def get_cached_input_ids(self, request: Request) -> Set[int]: return self.cached.get(request.request_id, set()) - def free(self, request: Request, input_id: int) -> None: + def free_encoder_input(self, request: Request, input_id: int) -> None: + """Free a single encoder input id for the request.""" req_id = request.request_id if req_id not in self.cached: return @@ -49,6 +50,12 @@ def free(self, request: Request, input_id: int) -> None: self.num_free_slots += request.get_num_encoder_tokens(input_id) self.freed.append((req_id, input_id)) + def free(self, request: Request) -> None: + """Free all cached input ids for the request.""" + input_ids = self.get_cached_input_ids(request) + for input_id in input_ids: + self.free_encoder_input(request, input_id) + def get_freed_ids(self) -> List[Tuple[str, int]]: freed = self.freed self.freed = [] diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 7a88cc9433b32..da2e31b1fb75b 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -202,7 +202,7 @@ def schedule(self) -> "SchedulerOutput": # which have output tokens. num_new_tokens = request.num_tokens - num_computed_tokens if num_new_tokens == 0: - # The happens when prompt length is divisible by the block + # This happens when prompt length is divisible by the block # size and all blocks are cached. Now we force to recompute # the last block. Note that we have to re-compute an entire # block because allocate_slots() assumes num_computed_tokens @@ -269,6 +269,7 @@ def schedule(self) -> "SchedulerOutput": # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. + num_common_prefix_blocks = 0 if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -433,7 +434,8 @@ def update_from_output( if start_pos + num_tokens <= request.num_computed_tokens: # The encoder output is already processed and stored # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + self.encoder_cache_manager.free_encoder_input( + request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] @@ -445,8 +447,10 @@ def update_from_output( # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. - # This must be called before me make the EngineCoreOutput. + # This must be called before we make the EngineCoreOutput. stopped = self._check_stop(request) + if stopped: + self._free_request(request) # Add EngineCoreOutput for this Request. 
output = EngineCoreOutput( @@ -472,7 +476,6 @@ def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len or request.num_output_tokens >= request.max_tokens): request.status = RequestStatus.FINISHED_LENGTH_CAPPED - self._free_request(request) return True sampling_params = request.sampling_params @@ -480,13 +483,11 @@ def _check_stop(self, request: Request) -> bool: if (not sampling_params.ignore_eos and last_token_id == request.eos_token_id): request.status = RequestStatus.FINISHED_STOPPED - self._free_request(request) return True if last_token_id in (sampling_params.stop_token_ids or ()): request.status = RequestStatus.FINISHED_STOPPED request.stop_reason = last_token_id - self._free_request(request) return True return False @@ -525,6 +526,7 @@ def finish_requests( def _free_request(self, request: Request) -> None: assert request.is_finished() self.kv_cache_manager.free(request) + self.encoder_cache_manager.free(request) self.running_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) From 1c1bb0bbf20955d346f66bb25d349c1bd9fe6ea2 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Wed, 29 Jan 2025 18:47:30 -0600 Subject: [PATCH 39/69] [Misc][MoE] add Deepseek-V3 moe tuning support (#12558) Signed-off-by: Divakar Verma --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 1fa0da75c79d2..5c8bf33afebc8 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -450,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, def main(args: argparse.Namespace): print(args) - config = AutoConfig.from_pretrained(args.model) + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k @@ -461,6 +462,11 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "DeepseekV3ForCausalLM": + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral. 
E = config.num_local_experts @@ -538,6 +544,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--tune", action="store_true") + parser.add_argument("--trust-remote-code", action="store_true") args = parser.parse_args() main(args) From f17f1d46086692a2973fad94860a95799fbd8582 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 30 Jan 2025 02:31:01 +0000 Subject: [PATCH 40/69] [V1][Metrics] Add GPU cache usage % gauge (#12561) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/core/kv_cache_manager.py | 5 +++++ vllm/v1/core/scheduler.py | 1 + vllm/v1/metrics/loggers.py | 11 ++++++++++- vllm/v1/metrics/stats.py | 2 +- 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 901ba8e8e5ef3..941f465711ef1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -200,6 +200,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", + "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_prompt_tokens_sum", diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 18fdfdfe4a010..d6c612f155f01 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -69,6 +69,11 @@ def __init__( # is finished. self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} + @property + def usage(self) -> float: + return 1.0 - (self.free_block_queue.num_free_blocks / + self.num_gpu_blocks) + def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index da2e31b1fb75b..910fc4ff4d2b6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -544,6 +544,7 @@ def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), + gpu_cache_usage=self.kv_cache_manager.usage, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 9bb24d1948651..f901822c7887c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -69,11 +69,13 @@ def log(self, scheduler_stats: SchedulerStats, logger.info( "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " - "Running: %d reqs, Waiting: %d reqs ", + "Running: %d reqs, Waiting: %d reqs " + "GPU KV cache usage: %.1f%%.", prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, + scheduler_stats.gpu_cache_usage * 100, ) @@ -97,6 +99,11 @@ def __init__(self, model_config: ModelConfig): documentation="Number of requests waiting to be processed.", labelnames=labelnames).labels(*labelvalues) + self.gauge_gpu_cache_usage = prometheus_client.Gauge( + name="vllm:gpu_cache_usage_perc", + documentation="GPU KV-cache usage. 
1 means 100 percent usage.", + labelnames=labelnames).labels(*labelvalues) + self.counter_prompt_tokens = prometheus_client.Counter( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", @@ -147,6 +154,8 @@ def log(self, scheduler_stats: SchedulerStats, self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_generation_tokens.inc( iteration_stats.num_generation_tokens) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index f4c276f0b6902..5277505128a63 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -14,7 +14,7 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 - # gpu_cache_usage: float = 0.0 + gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 From a2769032ca78108e58abc45e2eb0ade8b47a6515 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 30 Jan 2025 08:05:42 +0000 Subject: [PATCH 41/69] Set `?device={device}` when changing tab in installation guides (#12560) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/_static/custom.js | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 18b502c786e1d..be0b2a388e404 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -1,3 +1,4 @@ +// Add RunLLM widget document.addEventListener("DOMContentLoaded", function () { var script = document.createElement("script"); script.type = "module"; @@ -15,4 +16,23 @@ document.addEventListener("DOMContentLoaded", function () { script.async = true; document.head.appendChild(script); - }); \ No newline at end of file + }); + +// Update URL search params when tab is clicked + document.addEventListener("DOMContentLoaded", function () { + const tabs = document.querySelectorAll(".sd-tab-label"); + + function updateURL(tab) { + const syncGroup = tab.getAttribute("data-sync-group"); + const syncId = tab.getAttribute("data-sync-id"); + if (syncGroup && syncId) { + const url = new URL(window.location); + url.searchParams.set(syncGroup, syncId); + window.history.replaceState(null, "", url); + } + } + + tabs.forEach(tab => { + tab.addEventListener("click", () => updateURL(tab)); + }); +}); From 41bf5612f590dd13fa5e5dec083849ab6cde2f70 Mon Sep 17 00:00:00 2001 From: Beim <805908499@qq.com> Date: Fri, 31 Jan 2025 04:39:22 +1300 Subject: [PATCH 42/69] [Misc] fix typo: add missing space in lora adapter error message (#12564) Signed-off-by: Beim --- vllm/entrypoints/openai/serving_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index fc422f0917bd5..22e74b387cd73 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -203,7 +203,7 @@ async def _check_load_lora_adapter_request( for lora_request in self.lora_requests): return create_error_response( message= - f"The lora adapter '{request.lora_name}' has already been" + f"The lora adapter '{request.lora_name}' has already been " "loaded.", err_type="InvalidUserInput", status_code=HTTPStatus.BAD_REQUEST) From 9b0c4bab36c8f355f562d58521650ee8d5b6095d Mon Sep 17 00:00:00 2001 
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:53:22 -0500 Subject: [PATCH 43/69] [Kernel] Triton Configs for Fp8 Block Quantization (#11589) Signed-off-by: rshaw@neuralmagic.com Signed-off-by: mgoin Co-authored-by: mgoin Co-authored-by: simon-mo --- setup.py | 6 +- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ .../layers/fused_moe/fused_moe.py | 91 ++++++++--- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 146 ++++++++++++++++++ .../layers/quantization/utils/fp8_utils.py | 77 +++++++-- 43 files changed, 5972 insertions(+), 42 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json diff --git a/setup.py b/setup.py index 59ece870b5585..50a2392a4d83b 100755 --- a/setup.py +++ b/setup.py @@ -608,7 +608,11 @@ def _read_requirements(filename: str) -> List[str]: ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { - "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] + "vllm": [ + "py.typed", + "model_executor/layers/fused_moe/configs/*.json", + "model_executor/layers/quantization/utils/configs/*.json", + ] } if _no_device(): diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..2e692a1583a4a --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6fcf408755f5d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dbb6c2ce4649e..39607dc4ca11e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -598,15 +598,27 @@ def invoke_fused_moe_kernel(A: torch.Tensor, ) -def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str: +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +def get_config_file_name(E: int, + N: int, + dtype: Optional[str], + block_shape: Optional[List[int]] = None) -> str: device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" - return f"E={E},N={N},device_name={device_name}{dtype_selector}.json" + block_shape_selector = ("" if not block_shape or not all(block_shape) else + f",block_shape={block_shape}") + return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 @functools.lru_cache -def get_moe_configs(E: int, N: int, - dtype: Optional[str]) -> Optional[Dict[int, Any]]: +def get_moe_configs( + E: int, + N: int, + dtype: Optional[str], + block_n: Optional[int] = None, + block_k: Optional[int] = None, +) -> Optional[Dict[int, Any]]: """ Return optimized configurations 
for the fused MoE kernel. @@ -618,7 +630,8 @@ def get_moe_configs(E: int, N: int, # First look up if an optimized configuration is available in the configs # directory - json_file_name = get_config_file_name(E, N, dtype) + block_shape = [block_n, block_k] if block_n and block_k else None + json_file_name = get_config_file_name(E, N, dtype, block_shape) config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) @@ -645,21 +658,53 @@ def get_default_config( topk: int, dtype: Optional[str], is_marlin: bool, + block_shape: Optional[List[int]] = None, ) -> Dict[str, int]: - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - # A heuristic: fused marlin works faster with this config for small M - if M <= E or (is_marlin and M <= 32): + if dtype == "fp8_w8a8": + if block_shape is None: + config = { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4, + } + if M <= E: + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4, + } + else: + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] + # BLOCK_SIZE_K must be divisible by block_shape[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3, + } + else: config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, } + # A heuristic: fused marlin works faster with this config for small M + if M <= E or (is_marlin and M <= 32): + config = { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + } return config @@ -679,7 +724,9 @@ def try_get_optimal_moe_config( else: # First try to load optimal config from the file E, _, N = w2_shape - configs = get_moe_configs(E, N, dtype) + block_n = block_shape[0] if block_shape else 0 + block_k = block_shape[1] if block_shape else 0 + configs = get_moe_configs(E, N, dtype, block_n, block_k) if configs: # If an optimal configuration map has been found, look up the @@ -688,13 +735,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin) - # NOTE: For block-wise quant, - # BLOCK_K must be divisible by block_shape[1] - # BLOCK_N and BLOCK_M has no requirements - if block_shape is not None and block_shape[0] != 0: - config["BLOCK_SIZE_N"] = block_shape[0] - config["BLOCK_SIZE_K"] = block_shape[1] + is_marlin, block_shape) return config diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6496a38fba8ae --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..3618053b65831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..46a982f5ee9a4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..035ec027fa566 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..8b49f2781cb54 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..851bc9f9f0b50 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..d1227c2157990 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..1c61451fb34e5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..63e661c80de6a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..cf354037903c0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..eccb86a76df0d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..88af48431d8b8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..dd069726d7ed4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..56b939e52fac3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..63d9a0bf5d79d --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..7fa398c15a2a5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..f15d8f64c7090 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..cd3e07804fdec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..9d5a329d7466a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, 
+ "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..96e1594a3eabb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..5ffd367df833d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..eabc423949a24 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..51e237b91b8e7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..6280219c9ee7d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..40c01c0b92b4b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 
+ }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..c6fd3659799bc --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..160f12ed3f95a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..e5c4a1d2c94e5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..2bf5eb27e3820 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..0a1e14cffbb2a --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..15b1c93f60fc5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..8ff12e64c172f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..4532f93681e2b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..ca7f32b9552b4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..5acea242cc0ad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..58cdd93e90b8c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..b72e0371d1421 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000..293adce387e06 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 43b1997019107..a7a3fa6601639 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,12 +1,18 @@ # Adapted from https://github.com/sgl-project/sglang/pull/2575 -from typing import List, Optional, Tuple +import functools +import json +import os +from typing import Any, Dict, List, Optional, Tuple import torch import triton import triton.language as tl +from vllm.logger import init_logger from vllm.platforms import current_platform +logger = init_logger(__name__) + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -277,6 +283,43 @@ def _w8a8_block_fp8_matmul( tl.store(c_ptrs, c, mask=c_mask) +@functools.lru_cache +def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, + block_k: int) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the w8a8 block fp8 kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the w8a8 block fp8 kernel. 
To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs + # directory + device_name = current_platform.get_device_name().replace(" ", "_") + json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + "Using configuration from %s for W8A8 Block FP8 kernel.", + config_file_path, + ) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + "Using default W8A8 Block FP8 kernel config. Performance might " + "be sub-optimal! Config file not found at %s", + config_file_path, + ) + return None + + def w8a8_block_fp8_matmul( A: torch.Tensor, B: torch.Tensor, @@ -316,17 +359,22 @@ def w8a8_block_fp8_matmul( C_shape = A.shape[:-1] + (N, ) C = A.new_empty(C_shape, dtype=output_dtype) - # TODO: - # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized. - # BLOCK_SIZE_K must be divisible by block_k - # BLOCK_SIZE_N and BLOCK_SIZE_M has no requirements - BLOCK_SIZE_M = 128 - if M < BLOCK_SIZE_M: - BLOCK_SIZE_M = triton.next_power_of_2(M) - BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16) - BLOCK_SIZE_K = block_k - assert block_k % BLOCK_SIZE_K == 0 - BLOCK_SIZE_N = block_n + configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1]) + if configs: + # Get the optimal config if there is one + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0] + # BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + } def grid(META): return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * @@ -353,10 +401,7 @@ def grid(META): As.stride(-1), Bs.stride(1), Bs.stride(0), - BLOCK_SIZE_M=BLOCK_SIZE_M, - BLOCK_SIZE_N=BLOCK_SIZE_N, - BLOCK_SIZE_K=BLOCK_SIZE_K, - GROUP_SIZE_M=8, + **config, ) return C From bd2107e30a258a5bcaa94e678a3890ec083a60a0 Mon Sep 17 00:00:00 2001 From: Nishidha Date: Fri, 31 Jan 2025 02:59:39 +0530 Subject: [PATCH 44/69] [CPU][PPC] Updated torch, torchvision, torchaudio dependencies (#12555) Signed-off-by: npanpaliya --- Dockerfile.ppc64le | 5 ++--- requirements-cpu.txt | 12 +++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index d3cd1c7b313bc..c4c1f3e357972 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,12 +4,12 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev +RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # 
Currently these may not be available for venv or pip directly -RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes COPY ./ /workspace/vllm @@ -21,7 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/pip \ RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/requirements-cpu.txt b/requirements-cpu.txt index ed0d2c9fae0b6..ecfa822e01186 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -3,7 +3,13 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" -torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" -torchaudio; platform_machine != "ppc64le" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" + +# required for the image processor of minicpm-o-2_6, this must be updated alongside torch +torchaudio; platform_machine != "ppc64le" +torchaudio==2.5.1; platform_machine == "ppc64le" + +# required for the image processor of phi3v, this must be updated alongside torch +torchvision; platform_machine != "ppc64le" +torchvision==0.20.1; platform_machine == "ppc64le" datasets # for benchmark scripts From 4078052f09f42f898b542e18d60d15a43db67a8b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 30 Jan 2025 18:07:19 -0500 Subject: [PATCH 45/69] [V1][Log] Add max request concurrency log to V1 (#12569) Signed-off-by: mgoin --- vllm/v1/core/kv_cache_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bab99fe37caee..dbdda51aedaa0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -393,6 +393,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, num_blocks = num_gpu_blocks_override logger.info("# GPU blocks: %d", num_blocks) + max_concurrency = (num_blocks * vllm_config.cache_config.block_size / + vllm_config.model_config.max_model_len) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + vllm_config.model_config.max_model_len, max_concurrency) per_layer_size = page_size * num_blocks From 9798b2fb0052092a6420172e41c0c8a307eedfa6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 30 Jan 2025 21:33:00 -0500 Subject: [PATCH 46/69] [Kernel] Update `cutlass_scaled_mm` to support 2d group (blockwise) scaling (#11868) --- CMakeLists.txt | 9 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 290 ++++--- csrc/core/math.hpp | 9 +- csrc/cutlass_extensions/common.hpp | 17 + .../gemm/collective/collective_builder.hpp | 123 +++ .../gemm/collective/fp8_accumulation.hpp | 183 +++++ ..._warpspecialized_fp8_blockwise_scaling.hpp | 730 ++++++++++++++++++ .../gemm/dispatch_policy.hpp | 39 + .../vllm_collective_builder.cuh | 2 +- .../cutlass_w8a8/c3x/cutlass_gemm_caller.cuh | 93 +++ 
.../{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh} | 74 -- .../c3x/scaled_mm_azp_sm90_int8.cu | 24 + .../c3x/scaled_mm_blockwise_sm90_fp8.cu | 24 + .../scaled_mm_blockwise_sm90_fp8_dispatch.cuh | 168 ++++ .../cutlass_w8a8/c3x/scaled_mm_kernels.hpp | 33 + .../cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu | 24 + .../scaled_mm_sm90_fp8_dispatch.cuh} | 26 +- .../cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu | 24 + .../scaled_mm_sm90_int8_dispatch.cuh} | 25 +- .../cutlass_w8a8/scaled_mm_c3x.cu | 104 ++- .../cutlass_w8a8/scaled_mm_entry.cu | 3 - .../quantization/machete/machete_mainloop.cuh | 4 + tests/kernels/test_cutlass.py | 188 +++-- tests/kernels/utils.py | 32 +- vllm/_custom_ops.py | 22 + 25 files changed, 1924 insertions(+), 346 deletions(-) create mode 100644 csrc/cutlass_extensions/gemm/collective/collective_builder.hpp create mode 100644 csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp create mode 100644 csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp create mode 100644 csrc/cutlass_extensions/gemm/dispatch_policy.hpp create mode 100644 csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x.cuh => c3x/scaled_mm.cuh} (51%) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_fp8_dispatch.cuh => c3x/scaled_mm_sm90_fp8_dispatch.cuh} (76%) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu rename csrc/quantization/cutlass_w8a8/{scaled_mm_c3x_sm90_int8_dispatch.cuh => c3x/scaled_mm_sm90_int8_dispatch.cuh} (84%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c946fc5aa3ac..c823c9ff895c3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,7 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG v3.6.0 + GIT_TAG v3.7.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -299,7 +299,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index d0353bc8cb42a..b87496ca3b2b4 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -3,7 +3,7 @@ import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from typing import Callable, Iterable, List, Optional, Tuple import torch import torch.utils.benchmark as TBenchmark @@ -12,6 +12,8 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -38,8 +40,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ).blocked_autorange(min_run_time=min_run_time) -def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_int8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) @@ -48,155 +57,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "cutlass_i8_i8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_i8_i8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_i8_i8_bf16_scaled_mm_azp": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, None, bias), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, azp), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. 
+ bfloat16, azp_adj, azp, bias), + } + timers = [] - # pytorch impl - bfloat16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) - - # pytorch impl - float16 - timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, - a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - - # cutlass impl - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - - # cutlass with bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass with azp per-tensor - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj)) - - # cutlass with azp per-tensor + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, None, bias)) - - # cutlass with azp per-token - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp)) - - # cutlass with azp per-token + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp, bias)) + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_fp8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_cont = a.contiguous() scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + block_scale_a = torch.rand((m, k // 128), + device="cuda", + dtype=torch.float32) + block_scale_b = torch.rand((k // 128, n // 128), + device="cuda", + dtype=torch.float32) + block_scale_a_M_major = block_scale_a.t().contiguous().t() + block_scale_b_K_major = block_scale_b.t().contiguous().t() bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) - timers = [] + print(m, k, n) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "pytorch_fp8_fp8_fp16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": + lambda: torch._scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.float16, + use_fast_accum=True), + "pytorch_fp8_fp8_bf16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": + lambda: 
torch._scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True), + "cutlass_fp8_fp8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_fp8_fp8_fp16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16)), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a, + block_scale_b.t(), (128, 128)), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major, + block_scale_b_K_major, torch.float16), + } - # pytorch impl w. bf16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) - - # pytorch impl: bf16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16)) - - # pytorch impl: bf16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True)) - - # pytorch impl: fp16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16)) - - # pytorch impl: fp16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - use_fast_accum=True)) - - # cutlass impl: bf16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - # cutlass impl: fp16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16)) - - # cutlass impl: bf16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass impl: fp16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16, - bias.to(dtype=torch.float16))) + timers = [] + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. 
+ if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench(dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label) + return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label) + return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) raise ValueError("unsupported type") @@ -207,18 +193,22 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[Tuple[int, int, int]], + bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})") + timers = bench(dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels) print_timers(timers) results.extend(timers) - return results -# output makers def make_output(data: Iterable[TMeasurement], MKNs: Iterable[Tuple[int, int, int]], base_description: str, @@ -232,15 +222,11 @@ def make_output(data: Iterable[TMeasurement], pkl.dump(data, f) -# argparse runners - - def run_square_bench(args): dim_sizes = list( range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"square_bench-{args.dtype}") @@ -251,8 +237,7 @@ def run_range_bench(args): Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes MKNs = list(zip(Ms, Ks, Ns)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"range_bench-{args.dtype}") @@ -278,7 +263,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: for k, n in KNs: MKNs.append((m, k, n)) - data = run(args.dtype, MKNs) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) model_bench_data.append(data) # Print all results @@ -328,6 +313,15 @@ def to_torch_dtype(dt): type=to_torch_dtype, required=True, help="Available options are ['int8', 'fp8']") + parser.add_argument( + "--kernels", + nargs="+", + type=str, + default=None, + help= + "Exact names of the kernels to benchmark. If not set, runs all kernels." 
+ ) + subparsers = parser.add_subparsers(dest="cmd") square_parser = subparsers.add_parser("square_bench") @@ -362,4 +356,4 @@ def to_torch_dtype(dt): model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() - args.func(args) \ No newline at end of file + args.func(args) diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp index ba9f40a230c8e..ddfaca27147b4 100644 --- a/csrc/core/math.hpp +++ b/csrc/core/math.hpp @@ -1,7 +1,14 @@ +#pragma once + #include #include -inline uint32_t next_pow_2(uint32_t const num) { +inline constexpr uint32_t next_pow_2(uint32_t const num) { if (num <= 1) return num; return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +inline constexpr std::enable_if_t, T> ceil_div(T a, T b) { + return (a + b - 1) / b; } \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 07c9e46c27b06..febc4eccd9561 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -32,3 +32,20 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { } int32_t get_sm_version_num(); + +/** + * A wrapper for a kernel that is used to guard against compilation on + * architectures that will never use the kernel. The purpose of this is to + * reduce the size of the compiled binary. + * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef + * into code that will be executed on the device where it is defined. + */ +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; \ No newline at end of file diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp new file mode 100644 index 0000000000000..ec75c29e54f4d --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp @@ -0,0 +1,123 @@ +// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl +// clang-format off +#pragma once + +#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" + +#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_WS_SS (BlockScaled Builders) +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + int ScaleGranularityM +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, + cute::enable_if_t< + not detail::is_use_rmem_A()> +> { + using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; + + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 
Collective Builder\n"); +#endif + static_assert(detail::is_aligned(), + "Should meet TMA alignment requirement\n"); + + static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); + static constexpr bool IsFP8Input = detail::is_input_fp8(); + static_assert((!IsFP8Input || !IsArrayOfPointersGemm), + "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); + + // For fp32 types, map to tf32 MMA value type + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + static constexpr bool IsCooperative = cute::is_any_of_v>; + using AtomLayoutMNK = cute::conditional_t>, Layout>>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp new file mode 100644 index 0000000000000..13b90e998625e --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp @@ -0,0 +1,183 @@ +// clang-format off +// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cute/algorithm/clear.hpp" +#include "cute/tensor.hpp" + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////FP8 Accumulation/////////////////////////// +////////////////////////////////////////////////////////////////////////////// +/// This class provides API to promote (add) or scale (multiply_add) the results +/// from the tensor core accumulators to the main accumulators when the number +/// of MMAs reaches the max number of MMA interval specified by user, after that +/// the tensor core accumulators are zeroed. +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +template < + class EngineAccum, + class LayoutAccum> +struct GmmaFP8AccumulationWithScale { + using TensorAccum = cute::Tensor; + using ElementAccumulator = typename EngineAccum::value_type; + + static_assert(is_static::value, "Accumulator Layout should be static"); + static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); + +private: + TensorAccum& accum_; + TensorAccum accum_temp_; + + uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. + uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop + uint32_t mma_count_; // current executed MMAs + uint32_t reset_accum_flag_; // accum needs to be zeroed or not. + + // promote or `add` the partial accumulators to main accumulator (FADD). + CUTLASS_DEVICE + void promote_core() { + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i); + } + } + + // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
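+  // Unlike promote_core(), each partial accumulator element is multiplied by its
+  // per-element block scale before being added into the main accumulator; the
+  // preceding warpgroup_wait<0>() makes the in-flight WGMMA results visible first.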
+ template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_core(const cute::Tensor &scale) { + using TensorScale = cute::Tensor; + + static_assert(is_static::value, "Scale Layout should be static"); + static_assert(is_rmem::value , "Scale tensor must be rmem resident."); + + static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); + + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scale(i); + } + } + +public: + CUTLASS_DEVICE + GmmaFP8AccumulationWithScale( + TensorAccum &accum, + uint32_t accum_promotion_interval, + uint32_t mma_count_per_mainloop_iteration) + : accum_(accum), + accum_promotion_interval_(accum_promotion_interval), + mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), + mma_count_(0), + reset_accum_flag_(0) + { + accum_temp_ = cute::make_fragment_like(accum); + } + + // + // Methods (Common) + // + + CUTLASS_DEVICE + TensorAccum& operator()() { + return accum_temp_; + } + + /// prepare the MMA accumulators when initialization or zeroing is required. + CUTLASS_DEVICE + bool prepare_if_needed() { + return reset_accum_flag_; + } + + // + // Methods (for FADD version) + // + + /// promote (add) the results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_if_needed() { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + promote_core(); + mma_count_ = 0; + } + } + + /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_residue_if_needed() { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + promote_core(); + } + } + + // + // Methods (for FFMA version) + // + + /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_if_needed(const cute::Tensor &scale) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scale); + mma_count_ = 0; + } + } + + /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_residue_if_needed(const cute::Tensor &scale) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scale); + } + } +}; + +} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp new file mode 100644 index 0000000000000..928a9500cbb08 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -0,0 +1,730 @@ +// clang-format off +// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/trace.h" +#include "cutlass/numeric_types.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + int ScaleGranularityM_, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = 
TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using ElementBlockScale = ElementAccumulator; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); + using MainloopPipeline = cutlass::PipelineTmaAsync; + using PipelineState = cutlass::PipelineState; + using PipelineParams = typename MainloopPipeline::Params; + + // Two threads per CTA are producers (1 for operand tile and 32 for scales) + static constexpr int NumProducerThreadEvents = 33; + + static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; + static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + + static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + + // Tile along modes in a way that maximizes the TMA box size. + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + + // Block scaling gmem-to-smem copy atom + using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; + using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; + + // Block scaling smem layout + using SmemLayoutScaleA = Layout, Int>>; + using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
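+  // Shared-memory staging for the block scales: scale A keeps ScaleMsPerTile
+  // values per pipeline stage, while scale B keeps a single value per stage,
+  // since there is only one scale along N for the whole tile.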
+ + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v, + "ElementAccumulator and ElementBlockScale should be same datatype"); + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_A; // mxk + cute::array_aligned> smem_B; // nxk + cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k + cute::array_aligned> smem_scale_B; // 1xk + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // Device side kernel params + struct Params { + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy_A_sm90( + GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_,_,0), + TileShape{}, + ClusterShape{})); + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy_B_sm90( + GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_,_,0), + TileShape{}, + ClusterShape{})); + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; + // Block scaling factors for A and B + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + uint32_t transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t transaction_bytes_nk = TmaTransactionBytesNK; + uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; + + return { + tma_load_a, + tma_load_b, + transaction_bytes, + transaction_bytes_mk, + transaction_bytes_nk, + args.ptr_scale_A, + args.ptr_scale_B + }; + } + + 
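+  // Note that only A and B receive TMA descriptors here; the block-scale
+  // pointers are carried through Params unchanged and are read with ordinary
+  // cp.async copies inside load().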
template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits = 128; + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + return implementable; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytesMK = + cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytesNK = + cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) + { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) + /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) + template + CUTLASS_DEVICE auto + load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + constexpr auto scales_m = Int{}; + auto tM = get<2>(gA_mkl.shape()); + auto tN = get<2>(gB_nkl.shape()); + auto tK = get<3>(gA_mkl.shape()); + + // Make the tiled views of scale tensors + auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) + auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); + auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) + auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); + + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
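+    // The scale tensors are returned untiled; load() slices them by block
+    // coordinate and by k-tile when issuing the copies.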
+ Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) + Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) + + return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class TensorScaleA, class TensorScaleB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_write, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + // Blockscaling: Tma loads for load_input and CpAsync for load_scale + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Prepare the TMA loads for A and B + // + + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + + // Block scaling: load_scale has scaling tensors in global memory which are not tiled + Tensor mScaleA_mkl = get<2>(load_inputs); + Tensor mScaleB_nkl = get<3>(load_inputs); + auto scales_m = get<0>(mScaleA_mkl.shape()); + + Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); + + Tensor gScaleA = local_tile( + mScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) + Tensor cScaleA = local_tile( + cScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); + Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) + + // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 + TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, + Layout>{}, Layout>{}); // (1,1,1) + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, + Layout>{}, Layout>{}); // (1,1,1) + ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); + ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); + + Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); + Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); + Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); + + Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); + Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Allocate predicate tensors for a_scales (since we can't guarantee that + // all scales are valid, since we could have a partial tiles along M) + Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); + #pragma unroll + for (int i = 0; i < size(tApA_ScaleA); ++i) { + tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + // Copy operands A and B from global memory to shared memory + if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + + 
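+      // The operand tiles above are fetched via TMA from the single elected
+      // lane; the per-block scales below are small, so they are copied with
+      // cp.async by the producer threads instead.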
// Copy scale tensors from global memory to shared memory + copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); + copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); + pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + + ++k_tile_iter; + + // Advance smem_pipe_write + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail( + MainloopPipeline pipeline, + PipelineState smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_write); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + + + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // Block scaling + Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), + Layout< + Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, + Stride, _0, Int> + >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Define C accumulators and A/B partitioning + // + + // Layout of warp group to thread mapping + + static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and + stride<0>(typename TiledMma::BLayout{}) == 0 and + size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and + size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + + constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int{}, + Int{}); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
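+    // The sScaleAViewAsC view broadcasts each scale across its ScaleGranularityM
+    // rows and across the N mode, so partition_C gives every thread the scale
+    // that matches its accumulator rows.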
+ + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Per block scale values for operand A and B + + using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. + using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above + + Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) + ElementBlockScale scale_b; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); + warpgroup_fence_operand(accumulation()); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers. + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accumulation()); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accumulation()); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + + warpgroup_fence_operand(accumulation()); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp new file mode 100644 index 0000000000000..df809e27a3efe --- /dev/null +++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "cutlass/gemm/dispatch_policy.hpp" + +namespace cutlass::gemm { + +////////////////////////////////////////////////////////////////////////////// + +// FP8 related policies (including Blocked Scaled Accumulation) +// `ScaleGranularityM` specifies scaling granularity along M, while zero-value +// `ScaleGranularityM` indicates that scaling granularity is +// `size<0>(TileShape_MNK{})` along M. +template +struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum + : KernelTmaWarpSpecializedCooperative {}; + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp +// specialized dynamic schedule For FP8 kernels with Block Scaling +template , + class KernelSchedule = KernelTmaWarpSpecialized, + int ScaleGranularityM = + 0 // `ScaleGranularityM` specifies scaling granularity along M, + // while zero-value `ScaleGranularityM` indicates that scaling + // granularity is `size<0>(TileShape_MNK{})` along M. 
+ > +struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 + : MainloopSm90TmaGmmaWarpSpecialized { + static_assert( + cute::is_same_v< + KernelSchedule, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + ScaleGranularityM>>, + "KernelSchedule must be one of the warp specialized policies"); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index 085ee1290031f..e7fbba4cd4b0d 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh new file mode 100644 index 0000000000000..9ac7eee7204ec --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -0,0 +1,93 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +namespace vllm::c3x { + +static inline cute::Shape get_problem_shape( + torch::Tensor const& a, torch::Tensor const& b) { + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + return {m, n, k, 1}; +} + +template +void cutlass_gemm_caller(torch::Device device, + cute::Shape prob_shape, + typename GemmKernel::MainloopArguments mainloop_args, + typename GemmKernel::EpilogueArguments epilogue_args) { + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, mainloop_args, epilogue_args}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(device); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(device.index()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + using GemmKernel = typename Gemm::GemmKernel; + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = cute::Stride, int64_t>; + using StrideB = cute::Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, cute::Int<1>{}, 0}; + StrideB b_stride{ldb, cute::Int<1>{}, 0}; + StrideC c_stride{ldc, cute::Int<1>{}, cute::Int<0>{}}; + + typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +} // namespace vllm::c3x \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh similarity index 51% rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index d4bc2f0ade50d..9227ebb735245 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -2,9 +2,6 @@ // clang-format will break include orders // clang-format off -#include - -#include #include "cutlass/cutlass.h" @@ -32,21 +29,6 @@ using namespace cute; namespace vllm { -// A wrapper for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm90_or_later : Kernel { - template - CUTLASS_DEVICE void operator()(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 - Kernel::operator()(std::forward(args)...); -#endif - } -}; - template typename Epilogue_, typename TileShape, typename ClusterShape, typename KernelSchedule, @@ -101,60 +83,4 @@ struct cutlass_3x_gemm { struct GemmKernel : public KernelType {}; }; -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... 
epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - using GemmKernel = typename Gemm::GemmKernel; - typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, - b_stride}; - - auto c_ptr = static_cast(out.data_ptr()); - typename GemmKernel::EpilogueArguments epilogue_args{ - Gemm::Epilogue::prepare_args( - std::forward(epilogue_params)...), - c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, - prob_shape, mainloop_args, epilogue_args}; - - // Launch the CUTLASS GEMM kernel. - using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; - GemmOp gemm_op; - CUTLASS_CHECK(gemm_op.can_implement(args)); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); - CUTLASS_CHECK(status); -} - } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu new file mode 100644 index 0000000000000..4cd38f4975df7 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias) { + if (azp) { + return cutlass_scaled_mm_sm90_int8_epilogue< + c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj, + *azp, bias); + } else { + return cutlass_scaled_mm_sm90_int8_epilogue( + out, a, b, a_scales, b_scales, azp_adj, bias); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu new file mode 100644 index 0000000000000..0501e6da160e2 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu @@ -0,0 +1,24 @@ + +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + 
TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh new file mode 100644 index 0000000000000..fb7a82b80ee65 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -0,0 +1,168 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +template > +struct cutlass_3x_gemm_fp8_blockwise { + using GroupSizeM = Int; + using GroupSizeN = Int; + using GroupSizeK = Int; + using TileSizeM = Int; + + static_assert(TileSizeM_ % GroupSizeM_ == 0, + "TileSizeM must be a multiple of GroupSizeM"); + + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using StrideD = Stride, Int<0>>; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using StrideC = StrideD; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementBlockScale = float; + using ElementCompute = float; + using ArchTag = cutlass::arch::Sm90; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using TileShape = Shape; + + using KernelSchedule = cutlass::gemm:: + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + GroupSizeM_>; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; + using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, + ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, + ElementD, StrideD, AlignmentD, EpilogueSchedule, + StoreEpilogueCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, + LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + cutlass::gemm::PersistentScheduler>>; + + struct GemmKernel 
: public KernelType {}; + + using StrideA = typename GemmKernel::StrideA; + using StrideB = typename GemmKernel::StrideB; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + auto prob_shape = c3x::get_problem_shape(a, b); + int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), + k = get<2>(prob_shape); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + // Check is the t is contiguous and is 1D or 2D with one of the dimensions + // being 1 (i.e. a row or column vector) + auto is_contiguous_vector = [](const torch::Tensor& t) { + auto t_sizes = t.sizes(); + return t.is_contiguous() && + (t.dim() == 1 || + (t.dim() == 2 && + *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); + }; + + // TODO(lucas): lets clean-up the kernel so that we pass in Strides so + // we don't have to deal with enforcing implicit layouts + TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); + TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), + "a_scales must be M major"); + TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); + TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), + "b_scales must be K major"); + typename GemmKernel::MainloopArguments mainloop_args{ + a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template +void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + cutlass_gemm_caller_blockwise< + cutlass_3x_gemm_fp8_blockwise>(out, a, b, a_scales, + b_scales); +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp new file mode 100644 index 0000000000000..7ede9e067477b --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, 
torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias); + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu new file mode 100644 index 0000000000000..e092c61abc249 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm90_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm90_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh similarity index 76% rename from csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh rename to csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh index f08419b3122b2..32ea5db3321bc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh @@ -1,6 +1,7 @@ #pragma once -#include "scaled_mm_c3x.cuh" +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" /** * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm @@ -9,6 +10,8 @@ namespace vllm { +using c3x::cutlass_gemm_caller; + template typename Epilogue> struct sm90_fp8_config_default { @@ -93,4 +96,25 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, } } +template