Fix KV cache scale attribute check in load_kv_cache_scales: test layer_self_attn.attn for "_kv_scale" (#65)

ROCm · Jun 24, 2024 · 17e6307 · 17e6307
1 parent fa78403
commit 17e6307
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
@@ -572,7 +572,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
                 # which is consistent with the practice of setting
                 # scaling_factor = tensor_amax / FPtype_max
                 scaling_factor *= 2
-            if hasattr(layer_self_attn, "kv_scale"):
+            if hasattr(layer_self_attn.attn, "_kv_scale"):
                 layer_self_attn.attn._kv_scale = scaling_factor
             else:
                 raise RuntimeError("Self attention has no KV cache scaling "