From 92d618243ddecaa16a77ac0ff1f86cc6d2d0132b Mon Sep 17 00:00:00 2001
From: charlifu
Date: Mon, 24 Jun 2024 15:27:02 +0000
Subject: [PATCH] fix error

---
 vllm/model_executor/models/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f935f6060e65a..efe27e77dd4a2 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -572,7 +572,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
+            if hasattr(layer_self_attn.attn, "_kv_scale"):
                 layer_self_attn.attn._kv_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "
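
Note: the sketch below is a minimal, self-contained illustration of the pattern this patch applies, using hypothetical stand-in classes rather than vllm's real LlamaAttention/Attention implementations. The point it demonstrates is that the hasattr() guard should probe the same object and attribute that the subsequent assignment targets (layer_self_attn.attn._kv_scale), so the check and the write can no longer disagree.

    # Stand-in classes for illustration only; names and attributes are assumed,
    # not taken from vllm.
    class Attention:
        """Inner attention backend that owns the KV cache scaling factor."""
        def __init__(self) -> None:
            self._kv_scale = 1.0

    class LlamaAttention:
        """Wrapper module, playing the role of `layer_self_attn` in the patch."""
        def __init__(self) -> None:
            self.attn = Attention()

    layer_self_attn = LlamaAttention()
    scaling_factor = 0.5

    # Old check: probes the wrapper for "kv_scale", which it does not define
    # in this sketch, so the guard fails even though the assignment target exists.
    assert not hasattr(layer_self_attn, "kv_scale")

    # Fixed check: probe the exact object and attribute that will be assigned.
    if hasattr(layer_self_attn.attn, "_kv_scale"):
        layer_self_attn.attn._kv_scale = scaling_factor
    else:
        raise RuntimeError("Self attention has no KV cache scaling "
                           "factor attribute!")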