From 92d618243ddecaa16a77ac0ff1f86cc6d2d0132b Mon Sep 17 00:00:00 2001
From: charlifu
Date: Mon, 24 Jun 2024 15:27:02 +0000
Subject: [PATCH] fix error

---
 vllm/model_executor/models/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f935f6060e65a..efe27e77dd4a2 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -572,7 +572,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
+            if hasattr(layer_self_attn.attn, "_kv_scale"):
                 layer_self_attn.attn._kv_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
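
Note: the sketch below is a minimal, self-contained illustration of the pattern this patch applies, using hypothetical stand-in classes rather than vllm's real LlamaAttention/Attention implementations. The point it demonstrates is that the hasattr() guard should probe the same object and attribute that the subsequent assignment targets (layer_self_attn.attn._kv_scale), so the check and the write can no longer disagree.

    # Stand-in classes for illustration only; names and attributes are assumed,
    # not taken from vllm.
    class Attention:
        """Inner attention backend that owns the KV cache scaling factor."""
        def __init__(self) -> None:
            self._kv_scale = 1.0

    class LlamaAttention:
        """Wrapper module, playing the role of `layer_self_attn` in the patch."""
        def __init__(self) -> None:
            self.attn = Attention()

    layer_self_attn = LlamaAttention()
    scaling_factor = 0.5

    # Old check: probes the wrapper for "kv_scale", which it does not define
    # in this sketch, so the guard fails even though the assignment target exists.
    assert not hasattr(layer_self_attn, "kv_scale")

    # Fixed check: probe the exact object and attribute that will be assigned.
    if hasattr(layer_self_attn.attn, "_kv_scale"):
        layer_self_attn.attn._kv_scale = scaling_factor
    else:
        raise RuntimeError("Self attention has no KV cache scaling "
                           "factor attribute!")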