From 4f4d427ac2cee0f8ff7f79103001f6617fa8989c Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Fri, 31 Jan 2025 23:46:57 -0800
Subject: [PATCH] Disable chunked prefill and/or prefix caching when MLA is
 enabled (#12642)

From @mgoin in https://github.com/vllm-project/vllm/pull/12638

I cannot push to that branch, therefore a new PR to unblock release.

---------

Signed-off-by: mgoin
Signed-off-by: simon-mo
Co-authored-by: mgoin
---
 vllm/config.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index f998502eef0da..a13700aba3435 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3252,6 +3252,16 @@ def __post_init__(self):
 
         current_platform.check_and_update_config(self)
 
+        # If MLA is enabled, force disable chunked prefill and prefix caching
+        if self.model_config and self.model_config.use_mla:
+            logger.info("MLA is enabled; forcing chunked prefill and prefix "
+                        "caching to be disabled.")
+            self.scheduler_config.enable_chunked_prefill = False
+            self.scheduler_config.chunked_prefill_enabled = False
+
+            if self.cache_config is not None:
+                self.cache_config.enable_prefix_caching = False
+
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
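For readers who want to see the override behavior in isolation, below is a minimal, standalone sketch of the logic this patch adds to VllmConfig.__post_init__. The dataclasses and the apply_mla_overrides helper are hypothetical stand-ins for vLLM's real ModelConfig, SchedulerConfig, and CacheConfig classes; they are not part of the patch or of vLLM's API.

# Standalone sketch (not part of the patch): hypothetical stand-ins for
# vLLM's config classes, mirroring the gating logic added above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    use_mla: bool = False  # True for models using Multi-head Latent Attention


@dataclass
class SchedulerConfig:
    enable_chunked_prefill: bool = True
    chunked_prefill_enabled: bool = True


@dataclass
class CacheConfig:
    enable_prefix_caching: bool = True


def apply_mla_overrides(model_config: Optional[ModelConfig],
                        scheduler_config: SchedulerConfig,
                        cache_config: Optional[CacheConfig]) -> None:
    # Mirrors the patched __post_init__: MLA forces both chunked-prefill
    # flags off, and disables prefix caching when a cache config exists.
    if model_config and model_config.use_mla:
        scheduler_config.enable_chunked_prefill = False
        scheduler_config.chunked_prefill_enabled = False
        if cache_config is not None:
            cache_config.enable_prefix_caching = False


if __name__ == "__main__":
    sched, cache = SchedulerConfig(), CacheConfig()
    apply_mla_overrides(ModelConfig(use_mla=True), sched, cache)
    assert not sched.enable_chunked_prefill
    assert not sched.chunked_prefill_enabled
    assert not cache.enable_prefix_caching
    print("MLA overrides applied: chunked prefill and prefix caching disabled")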