From 84f03a3fac7967bb3f71b32cb5f9deee46ec7d59 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi3.zhang@intel.com>
Date: Mon, 6 Jan 2025 11:03:47 +0800
Subject: [PATCH] [CPU] Define key/value cache precision/group_size priority

Signed-off-by: Zhang Yi <yi3.zhang@intel.com>
---
 .../openvino/runtime/properties/__init__.py   |   4 +
 .../runtime/properties/hint/__init__.py       |   4 -
 .../pyopenvino/core/properties/properties.cpp |   8 +-
 .../tests/test_runtime/test_properties.py     |  24 ++--
 .../include/openvino/runtime/properties.hpp   |  48 +++----
 src/plugins/intel_cpu/src/compiled_model.cpp  |  24 ++--
 src/plugins/intel_cpu/src/config.cpp          |  48 +++++--
 src/plugins/intel_cpu/src/config.h            |   4 +
 .../intel_cpu/src/nodes/scaled_attn.cpp       |  19 ++-
 src/plugins/intel_cpu/src/plugin.cpp          |  24 ++--
 .../ov_executable_network/properties.cpp      | 128 +++++++++++++-----
 .../custom/behavior/ov_plugin/properties.cpp  |   8 +-
 12 files changed, 225 insertions(+), 118 deletions(-)

diff --git a/src/bindings/python/src/openvino/runtime/properties/__init__.py b/src/bindings/python/src/openvino/runtime/properties/__init__.py
index 3269ea42e32ac2..a02a18e556135b 100644
--- a/src/bindings/python/src/openvino/runtime/properties/__init__.py
+++ b/src/bindings/python/src/openvino/runtime/properties/__init__.py
@@ -30,6 +30,10 @@
 from openvino._pyopenvino.properties import loaded_from_cache
 from openvino._pyopenvino.properties import cache_encryption_callbacks
 from openvino._pyopenvino.properties import weights_path
+from openvino._pyopenvino.properties import key_cache_precision
+from openvino._pyopenvino.properties import value_cache_precision
+from openvino._pyopenvino.properties import key_cache_group_size
+from openvino._pyopenvino.properties import value_cache_group_size
 
 # Submodules
 from openvino.runtime.properties import hint
diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py
index d5c5d5595e5e0b..d1dce289d09941 100644
--- a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py
+++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py
@@ -23,8 +23,4 @@
 from openvino._pyopenvino.properties.hint import allow_auto_batching
 from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
 from openvino._pyopenvino.properties.hint import kv_cache_precision
-from openvino._pyopenvino.properties.hint import key_cache_precision
-from openvino._pyopenvino.properties.hint import value_cache_precision
-from openvino._pyopenvino.properties.hint import key_cache_group_size
-from openvino._pyopenvino.properties.hint import value_cache_group_size
 from openvino._pyopenvino.properties.hint import activations_scale_factor
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
index 2b997c6664cee0..937e9b66a0135f 100644
--- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
+++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -44,6 +44,10 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate");
     wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap");
     wrap_property_RW(m_properties, ov::weights_path, "weights_path");
+    wrap_property_RW(m_properties, ov::key_cache_precision, "key_cache_precision");
+    wrap_property_RW(m_properties, ov::value_cache_precision, "value_cache_precision");
+    wrap_property_RW(m_properties, ov::key_cache_group_size, "key_cache_group_size");
+    wrap_property_RW(m_properties, ov::value_cache_group_size, "value_cache_group_size");
 
     wrap_property_RO(m_properties, ov::supported_properties, "supported_properties");
     wrap_property_RO(m_properties, ov::available_devices, "available_devices");
@@ -101,10 +105,6 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
     wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
     wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
-    wrap_property_RW(m_hint, ov::hint::key_cache_precision, "key_cache_precision");
-    wrap_property_RW(m_hint, ov::hint::value_cache_precision, "value_cache_precision");
-    wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size");
-    wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size");
     wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
 
     // Submodule intel_cpu
diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index d0745f84361310..cbdd117c9fe97f 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -271,6 +271,18 @@ def test_properties_ro(ov_property_ro, expected_value):
             "WEIGHTS_PATH",
             (("./model.bin", "./model.bin"),),
         ),
+        (
+            props.key_cache_group_size,
+            "KEY_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
+        (
+            props.value_cache_group_size,
+            "VALUE_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
+        (props.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)),
+        (props.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)),
         (hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)),
         (
             hints.model_priority,
@@ -334,19 +346,7 @@ def test_properties_ro(ov_property_ro, expected_value):
             "DYNAMIC_QUANTIZATION_GROUP_SIZE",
             ((64, 64),),
         ),
-        (
-            hints.key_cache_group_size,
-            "KEY_CACHE_GROUP_SIZE",
-            ((64, 64),),
-        ),
-        (
-            hints.value_cache_group_size,
-            "VALUE_CACHE_GROUP_SIZE",
-            ((64, 64),),
-        ),
         (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
-        (hints.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)),
-        (hints.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)),
         (
             hints.activations_scale_factor,
             "ACTIVATIONS_SCALE_FACTOR",
diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp
index 729ccc93feac1f..c7570b818f9665 100644
--- a/src/inference/include/openvino/runtime/properties.hpp
+++ b/src/inference/include/openvino/runtime/properties.hpp
@@ -580,30 +580,6 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
  */
 static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
 
-/**
- * @brief Hint for device to use specified precision for key cache compression
- * @ingroup ov_runtime_cpp_prop_api
- */
-static constexpr Property<element::Type, PropertyMutability::RW> key_cache_precision{"KEY_CACHE_PRECISION"};
-
-/**
- * @brief Hint for device to use specified precision for value cache compression
- * @ingroup ov_runtime_cpp_prop_api
- */
-static constexpr Property<element::Type, PropertyMutability::RW> value_cache_precision{"VALUE_CACHE_PRECISION"};
-
-/**
- * @brief Hint for device to use group_size for key cache compression
- * @ingroup ov_runtime_cpp_prop_api
- */
-static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};
-
-/**
- * @brief Hint for device to use group_size for value cache compression
- * @ingroup ov_runtime_cpp_prop_api
- */
-static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};
-
 /**
  * @brief This property scales down activations to prevent overflows when inference precision is f16.
  * @ingroup ov_runtime_cpp_prop_api
@@ -1383,4 +1359,28 @@ static constexpr Property<std::vector<std::string>, PropertyMutability::RO> exec
  * @note This property is used for weightless caching. Only used when ov::CacheMode Property is set to "OPTIMIZE_SIZE".
  */
 static constexpr Property<std::string, PropertyMutability::RW> weights_path{"WEIGHTS_PATH"};
+
+/**
+ * @brief The precision of key cache compression
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<element::Type, PropertyMutability::RW> key_cache_precision{"KEY_CACHE_PRECISION"};
+
+/**
+ * @brief The precision of value cache compression
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<element::Type, PropertyMutability::RW> value_cache_precision{"VALUE_CACHE_PRECISION"};
+
+/**
+ * @brief The group_size of key cache compression
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};
+
+/**
+ * @brief The group_size of value cache compression
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
index 59ba95ffbeb4c1..60b63f871e0c95 100644
--- a/src/plugins/intel_cpu/src/compiled_model.cpp
+++ b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -256,10 +256,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
             RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
             RO_property(ov::hint::dynamic_quantization_group_size.name()),
             RO_property(ov::hint::kv_cache_precision.name()),
-            RO_property(ov::hint::key_cache_precision.name()),
-            RO_property(ov::hint::value_cache_precision.name()),
-            RO_property(ov::hint::key_cache_group_size.name()),
-            RO_property(ov::hint::value_cache_group_size.name()),
+            RO_property(ov::key_cache_precision.name()),
+            RO_property(ov::value_cache_precision.name()),
+            RO_property(ov::key_cache_group_size.name()),
+            RO_property(ov::value_cache_group_size.name()),
         };
 
         OPENVINO_SUPPRESS_DEPRECATED_START
@@ -336,14 +336,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
         return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize);
     } else if (name == ov::hint::kv_cache_precision) {
         return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
-    } else if (name == ov::hint::key_cache_precision) {
-        return decltype(ov::hint::key_cache_precision)::value_type(config.keyCachePrecision);
-    } else if (name == ov::hint::value_cache_precision) {
-        return decltype(ov::hint::value_cache_precision)::value_type(config.valueCachePrecision);
-    } else if (name == ov::hint::key_cache_group_size) {
-        return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize);
-    } else if (name == ov::hint::value_cache_group_size) {
-        return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize);
+    } else if (name == ov::key_cache_precision) {
+        return decltype(ov::key_cache_precision)::value_type(config.keyCachePrecision);
+    } else if (name == ov::value_cache_precision) {
+        return decltype(ov::value_cache_precision)::value_type(config.valueCachePrecision);
+    } else if (name == ov::key_cache_group_size) {
+        return decltype(ov::key_cache_group_size)::value_type(config.keyCacheGroupSize);
+    } else if (name == ov::value_cache_group_size) {
+        return decltype(ov::value_cache_group_size)::value_type(config.valueCacheGroupSize);
     }
     OPENVINO_THROW("Unsupported property: ", name);
 }
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 3b052e7094d34c..1004ae076eac10 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -373,9 +373,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                                ov::hint::kv_cache_precision.name(),
                                ". Supported values: u8, bf16, f16, f32");
             }
-        } else if (key == ov::hint::key_cache_precision.name()) {
+        } else if (key == ov::key_cache_precision.name()) {
             try {
-                kvCachePrecisionSetExplicitly = true;
+                keyCachePrecisionSetExplicitly = true;
                 auto const prec = val.as<ov::element::Type>();
                 if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
                     keyCachePrecision = prec;
@@ -386,12 +386,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                 OPENVINO_THROW("Wrong value ",
                                val.as<std::string>(),
                                " for property key ",
-                               ov::hint::key_cache_precision.name(),
+                               ov::key_cache_precision.name(),
                                ". Supported values: u8, bf16, f16, f32");
             }
-        } else if (key == ov::hint::value_cache_precision.name()) {
+        } else if (key == ov::value_cache_precision.name()) {
             try {
-                kvCachePrecisionSetExplicitly = true;
+                valueCachePrecisionSetExplicitly = true;
                 auto const prec = val.as<ov::element::Type>();
                 if (one_of(prec,
                            ov::element::f32,
@@ -407,15 +407,17 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                 OPENVINO_THROW("Wrong value ",
                                val.as<std::string>(),
                                " for property key ",
-                               ov::hint::value_cache_precision.name(),
+                               ov::value_cache_precision.name(),
                                ". Supported values: u4, u8, bf16, f16, f32");
             }
-        } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) {
+        } else if (key == ov::key_cache_group_size.name() || key == ov::value_cache_group_size.name()) {
             try {
                 auto const groupSize = val.as<uint64_t>();
-                if (key == ov::hint::key_cache_group_size.name()) {
+                if (key == ov::key_cache_group_size.name()) {
+                    keyCacheGroupSizeSetExplicitly = true;
                     keyCacheGroupSize = groupSize;
                 } else {
+                    valueCacheGroupSizeSetExplicitly = true;
                     valueCacheGroupSize = groupSize;
                 }
             } catch (ov::Exception&) {
@@ -460,6 +462,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         aclFastMath = true;
     }
 #endif
+    // Explicitly set key/value cache precision wins; otherwise fall back to an explicitly set kvCachePrecision
+    if (!keyCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
+        keyCachePrecision = kvCachePrecision;
+    }
+    if (!valueCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
+        valueCachePrecision = kvCachePrecision;
+    }
     // disable dynamic quantization and kv quantization for best accuracy
     if (executionMode == ov::hint::ExecutionMode::ACCURACY) {
         if (!fcDynamicQuantizationGroupSizeSetExplicitly) {
@@ -467,9 +476,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
         }
         if (!kvCachePrecisionSetExplicitly) {
             kvCachePrecision = ov::element::f32;
-            valueCachePrecision = ov::element::f32;
+        }
+        if (!keyCachePrecisionSetExplicitly) {
             keyCachePrecision = ov::element::f32;
         }
+        if (!valueCachePrecisionSetExplicitly) {
+            valueCachePrecision = ov::element::f32;
+        }
     }
 
     if (!prop.empty())
@@ -524,6 +537,23 @@ void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
         this->fcDynamicQuantizationGroupSize =
             model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
     }
+    if (!keyCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_precision.name()})) {
+        this->keyCachePrecision =
+            model->get_rt_info<ov::element::Type>({"runtime_options", ov::key_cache_precision.name()});
+    }
+    if (!valueCachePrecisionSetExplicitly &&
+        model->has_rt_info({"runtime_options", ov::value_cache_precision.name()})) {
+        this->valueCachePrecision =
+            model->get_rt_info<ov::element::Type>({"runtime_options", ov::value_cache_precision.name()});
+    }
+    if (!keyCacheGroupSizeSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_group_size.name()})) {
+        this->keyCacheGroupSize = model->get_rt_info<uint64_t>({"runtime_options", ov::key_cache_group_size.name()});
+    }
+    if (!valueCacheGroupSizeSetExplicitly &&
+        model->has_rt_info({"runtime_options", ov::value_cache_group_size.name()})) {
+        this->valueCacheGroupSize =
+            model->get_rt_info<uint64_t>({"runtime_options", ov::value_cache_group_size.name()});
+    }
 }
 
 }  // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 94d4b6e90c531d..75bfde2303a34f 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -48,6 +48,10 @@ struct Config {
     uint64_t fcDynamicQuantizationGroupSize = 32;
     bool fcDynamicQuantizationGroupSizeSetExplicitly = false;
     bool kvCachePrecisionSetExplicitly = false;
+    bool keyCachePrecisionSetExplicitly = false;
+    bool valueCachePrecisionSetExplicitly = false;
+    bool keyCacheGroupSizeSetExplicitly = false;
+    bool valueCacheGroupSizeSetExplicitly = false;
 #if defined(OV_CPU_WITH_ACL)
     bool aclFastMath = false;
 #endif
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
index c0d19a9acd6e15..41d87e3388a035 100644
--- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
+++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp
@@ -1061,7 +1061,14 @@ ScaledDotProductAttention::ScaledDotProductAttention(const std::shared_ptr<ov::N
     if (!isSupportedOperation(op, errorMessage)) {
         OPENVINO_THROW("CPU: " + errorMessage);
     }
-
+    const auto& cpuConfig = context->getConfig();
+    const auto& keyCachePrecision = cpuConfig.keyCachePrecision;
+    const auto& valueCachePrecision = cpuConfig.valueCachePrecision;
+    OPENVINO_ASSERT(valueCachePrecision == keyCachePrecision,
+                    "CPU: SDPA node only supports same key/value cache precision");
+    OPENVINO_ASSERT(one_of(keyCachePrecision, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8),
+                    "CPU: SDPA only supports key/value cache precision f32, f16, bf16, u8 but gets ",
+                    keyCachePrecision);
     if (const auto node = std::dynamic_pointer_cast<const ov::op::v13::ScaledDotProductAttention>(op)) {
         m_config.config.is_causal = node->get_causal();
     } else if (const auto node = std::dynamic_pointer_cast<const ScaledDotProductAttentionWithKVCache>(op)) {
@@ -1835,12 +1842,16 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M
 
 ov::element::Type ScaledDotProductAttention::getKVCachePrecision() {
     ov::element::Type kvcache_precision;
+    // TODO: SDPA currently supports only identical key and value cache precisions.
     auto rtPrecision = getRuntimePrecision();
-    auto kvCachePrecisionHint = context->getConfig().kvCachePrecision;
+    auto keyCachePrecisionHint = context->getConfig().keyCachePrecision;
+    auto valueCachePrecisionHint = context->getConfig().valueCachePrecision;
     bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) &&
-                             rtPrecision != ov::element::bf16 && kvCachePrecisionHint == ov::element::f16;
+                             rtPrecision != ov::element::bf16 &&
+                             (keyCachePrecisionHint == ov::element::f16 && valueCachePrecisionHint == ov::element::f16);
     kvcache_precision = enableKVCacheFP16 ? ov::element::f16 : rtPrecision;
-    bool use_int8_kv_cache_precision = kvCachePrecisionHint == ov::element::u8;
+    bool use_int8_kv_cache_precision =
+        (keyCachePrecisionHint == ov::element::u8 && valueCachePrecisionHint == ov::element::u8);
     if (use_int8_kv_cache_precision)
         kvcache_precision = ov::element::u8;
     else
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index 1c7c79a9c9c6e0..ec9b37c2c2d22e 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -392,14 +392,14 @@ ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& options)
             engConfig.fcDynamicQuantizationGroupSize);
     } else if (name == ov::hint::kv_cache_precision) {
         return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision);
-    } else if (name == ov::hint::key_cache_precision) {
-        return decltype(ov::hint::key_cache_precision)::value_type(engConfig.keyCachePrecision);
-    } else if (name == ov::hint::value_cache_precision) {
-        return decltype(ov::hint::value_cache_precision)::value_type(engConfig.valueCachePrecision);
-    } else if (name == ov::hint::key_cache_group_size) {
-        return decltype(ov::hint::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize);
-    } else if (name == ov::hint::value_cache_group_size) {
-        return decltype(ov::hint::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize);
+    } else if (name == ov::key_cache_precision) {
+        return decltype(ov::key_cache_precision)::value_type(engConfig.keyCachePrecision);
+    } else if (name == ov::value_cache_precision) {
+        return decltype(ov::value_cache_precision)::value_type(engConfig.valueCachePrecision);
+    } else if (name == ov::key_cache_group_size) {
+        return decltype(ov::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize);
+    } else if (name == ov::value_cache_group_size) {
+        return decltype(ov::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize);
     }
     return get_ro_property(name, options);
 }
@@ -443,10 +443,10 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio
             RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
             RW_property(ov::hint::dynamic_quantization_group_size.name()),
             RW_property(ov::hint::kv_cache_precision.name()),
-            RW_property(ov::hint::key_cache_precision.name()),
-            RW_property(ov::hint::value_cache_precision.name()),
-            RW_property(ov::hint::key_cache_group_size.name()),
-            RW_property(ov::hint::value_cache_group_size.name()),
+            RW_property(ov::key_cache_precision.name()),
+            RW_property(ov::value_cache_precision.name()),
+            RW_property(ov::key_cache_group_size.name()),
+            RW_property(ov::value_cache_group_size.name()),
         };
 
         OPENVINO_SUPPRESS_DEPRECATED_START
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index 016648a7e1026f..9d38d03e5eadde 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -2,14 +2,15 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "openvino/runtime/properties.hpp"
+
 #include <gtest/gtest.h>
 
-#include "utils/properties_test.hpp"
-#include "openvino/runtime/system_conf.hpp"
-#include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
-#include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/core.hpp"
 #include "openvino/runtime/intel_cpu/properties.hpp"
+#include "openvino/runtime/system_conf.hpp"
+#include "utils/properties_test.hpp"
 
 namespace {
 
@@ -41,10 +42,10 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
         RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RO_property(ov::hint::dynamic_quantization_group_size.name()),
         RO_property(ov::hint::kv_cache_precision.name()),
-        RO_property(ov::hint::key_cache_precision.name()),
-        RO_property(ov::hint::value_cache_precision.name()),
-        RO_property(ov::hint::key_cache_group_size.name()),
-        RO_property(ov::hint::value_cache_group_size.name()),
+        RO_property(ov::key_cache_precision.name()),
+        RO_property(ov::value_cache_precision.name()),
+        RO_property(ov::key_cache_group_size.name()),
+        RO_property(ov::value_cache_group_size.name()),
     };
 
     ov::Core ie;
@@ -88,7 +89,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSetROPropertiesThrow) {
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanThroughputHint) {
     ov::Core ie;
-    int32_t streams = 1; // throughput hint should apply higher number of streams
+    int32_t streams = 1;  // throughput hint should apply higher number of streams
     int32_t value = 0;
 
     OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams)));
@@ -101,7 +102,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanLatencyHint) {
     ov::Core ie;
-    int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams
+    int32_t streams = ov::get_number_of_cpu_cores();  // latency hint should apply lower number of streams
     int32_t value = 0;
 
     OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams)));
@@ -114,7 +115,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanLatencyHint) {
     ov::Core ie;
-    int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams
+    int32_t streams = ov::get_number_of_cpu_cores();  // latency hint should apply lower number of streams
     int32_t value = 0;
 
     OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)));
@@ -129,7 +130,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPrior
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanThroughputHint) {
     ov::Core ie;
-    int32_t streams = 1; // throughput hint should apply higher number of streams
+    int32_t streams = 1;  // throughput hint should apply higher number of streams
     int32_t value = 0;
 
     ov::AnyMap config;
@@ -190,14 +191,14 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckKVCachePrecision) {
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCachePrecision) {
     ov::Core core;
 
-    core.set_property(deviceName, ov::hint::key_cache_precision(ov::element::f16));
-    core.set_property(deviceName, ov::hint::value_cache_precision(ov::element::u4));
+    core.set_property(deviceName, ov::key_cache_precision(ov::element::f16));
+    core.set_property(deviceName, ov::value_cache_precision(ov::element::u4));
     ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
 
     auto key_cache_precision_value = ov::element::undefined;
     auto value_cache_precision_value = ov::element::undefined;
-    OV_ASSERT_NO_THROW(key_cache_precision_value = compiledModel.get_property(ov::hint::key_cache_precision));
-    OV_ASSERT_NO_THROW(value_cache_precision_value = compiledModel.get_property(ov::hint::value_cache_precision));
+    OV_ASSERT_NO_THROW(key_cache_precision_value = compiledModel.get_property(ov::key_cache_precision));
+    OV_ASSERT_NO_THROW(value_cache_precision_value = compiledModel.get_property(ov::value_cache_precision));
     ASSERT_EQ(key_cache_precision_value, ov::element::f16);
     ASSERT_EQ(value_cache_precision_value, ov::element::u4);
 }
@@ -205,14 +206,14 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCachePrecision) {
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCacheGroupSize) {
     ov::Core core;
 
-    core.set_property(deviceName, ov::hint::key_cache_group_size(32));
-    core.set_property(deviceName, ov::hint::value_cache_group_size(16));
+    core.set_property(deviceName, ov::key_cache_group_size(32));
+    core.set_property(deviceName, ov::value_cache_group_size(16));
     ov::CompiledModel compiledModel = core.compile_model(model, deviceName);
 
     auto key_cache_group_size_value = 0;
     auto value_cache_group_size_value = 0;
-    OV_ASSERT_NO_THROW(key_cache_group_size_value = compiledModel.get_property(ov::hint::key_cache_group_size));
-    OV_ASSERT_NO_THROW(value_cache_group_size_value = compiledModel.get_property(ov::hint::value_cache_group_size));
+    OV_ASSERT_NO_THROW(key_cache_group_size_value = compiledModel.get_property(ov::key_cache_group_size));
+    OV_ASSERT_NO_THROW(value_cache_group_size_value = compiledModel.get_property(ov::value_cache_group_size));
     ASSERT_EQ(key_cache_group_size_value, 32);
     ASSERT_EQ(value_cache_group_size_value, 16);
 }
@@ -260,7 +261,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableIn
     ASSERT_FALSE(model_exec_mode_it->is_mutable());
 }
 
-TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) {
+TEST_F(OVClassConfigTestCPU,
+       smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) {
     ov::Core ie;
     auto inference_precision_value = ov::element::undefined;
 
@@ -274,7 +276,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHas
     ASSERT_EQ(inference_precision_value, bf16_if_can_be_emulated);
 }
 
-TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) {
+TEST_F(OVClassConfigTestCPU,
+       smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) {
     ov::Core ie;
     auto execution_mode_value = ov::hint::ExecutionMode::ACCURACY;
     auto inference_precision_value = ov::element::undefined;
@@ -292,7 +295,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasH
     ASSERT_EQ(inference_precision_value, ov::element::f32);
 }
 
-TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) {
+TEST_F(OVClassConfigTestCPU,
+       smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) {
     ov::Core ie;
     auto execution_mode_value = ov::hint::ExecutionMode::PERFORMANCE;
     auto inference_precision_value = ov::element::undefined;
@@ -323,14 +327,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckLogLevel) {
         OV_ASSERT_NO_THROW(value = compiledModel.get_property(ov::log::level));
         ASSERT_EQ(value.as<ov::log::Level>(), ov::log::Level::NO);
     }
-    //check set and get
-    const std::vector<ov::log::Level> logLevels = {
-        ov::log::Level::ERR,
-        ov::log::Level::NO,
-        ov::log::Level::WARNING,
-        ov::log::Level::INFO,
-        ov::log::Level::DEBUG,
-        ov::log::Level::TRACE};
+    // check set and get
+    const std::vector<ov::log::Level> logLevels = {ov::log::Level::ERR,
+                                                   ov::log::Level::NO,
+                                                   ov::log::Level::WARNING,
+                                                   ov::log::Level::INFO,
+                                                   ov::log::Level::DEBUG,
+                                                   ov::log::Level::TRACE};
 
     for (unsigned int i = 0; i < logLevels.size(); i++) {
         ov::Any value;
@@ -365,50 +368,109 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {
     ov::Core ie;
     ov::Any type;
     ov::Any size;
+    ov::Any keySize;
+    ov::Any valueSize;
+    ov::Any keyCacheType;
+    ov::Any valueCacheType;
     ov::CompiledModel compiledModel;
     model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
     model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    model->set_rt_info("32", "runtime_options", ov::key_cache_group_size.name());
+    model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name());
+    model->set_rt_info("u8", "runtime_options", ov::key_cache_precision.name());
+    model->set_rt_info("u8", "runtime_options", ov::value_cache_precision.name());
     OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName));
     OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
     OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size));
+    OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size));
+    OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision));
+    OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision));
     ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f16);
     ASSERT_EQ(size.as<uint64_t>(), 0);
+    ASSERT_EQ(keySize.as<uint64_t>(), 32);
+    ASSERT_EQ(valueSize.as<uint64_t>(), 16);
+    ASSERT_EQ(keyCacheType.as<ov::element::Type>(), ov::element::u8);
+    ASSERT_EQ(valueCacheType.as<ov::element::Type>(), ov::element::u8);
 }
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) {
     ov::Core ie;
     ov::Any type;
     ov::Any size;
+    ov::Any keySize;
+    ov::Any valueSize;
+    ov::Any keyCacheType;
+    ov::Any valueCacheType;
     ov::CompiledModel compiledModel;
     model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
     model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    model->set_rt_info("0", "runtime_options", ov::key_cache_group_size.name());
+    model->set_rt_info("0", "runtime_options", ov::value_cache_group_size.name());
+    model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name());
+    model->set_rt_info("f32", "runtime_options", ov::value_cache_precision.name());
     ov::AnyMap config;
     config[ov::hint::kv_cache_precision.name()] = "u8";
     config[ov::hint::dynamic_quantization_group_size.name()] = "16";
+    // properties passed to compile_model have higher priority than rt_info
+    config[ov::key_cache_group_size.name()] = "32";
+    config[ov::value_cache_group_size.name()] = "16";
+    // key_cache_precision/value_cache_precision have higher priority than kv_cache_precision
+    config[ov::key_cache_precision.name()] = "f16";
+    config[ov::value_cache_precision.name()] = "bf16";
     OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config));
     OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
     OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size));
+    OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size));
+    OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision));
+    OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision));
     ASSERT_EQ(type.as<ov::element::Type>(), ov::element::u8);
     ASSERT_EQ(size.as<uint64_t>(), 16);
+    ASSERT_EQ(keySize.as<uint64_t>(), 32);  // compile config value, not rt_info "0"
+    ASSERT_EQ(valueSize.as<uint64_t>(), 16);  // compile config value, not rt_info "0"
+    ASSERT_EQ(keyCacheType.as<ov::element::Type>(), ov::element::f16);
+    ASSERT_EQ(valueCacheType.as<ov::element::Type>(), ov::element::bf16);
 }
 
 TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCoreProperties) {
     ov::Core core;
     ov::Any type;
     ov::Any size;
-
+    ov::Any keySize;
+    ov::Any valueSize;
+    ov::Any keyCacheType;
+    ov::Any valueCacheType;
     core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32));
     core.set_property(deviceName, ov::hint::dynamic_quantization_group_size(16));
+    core.set_property(deviceName, ov::key_cache_group_size(8));
+    core.set_property(deviceName, ov::value_cache_group_size(8));
+    core.set_property(deviceName, ov::key_cache_precision(ov::element::f16));
+    core.set_property(deviceName, ov::value_cache_precision(ov::element::bf16));
 
     ov::CompiledModel compiledModel;
     model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
     model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    model->set_rt_info("32", "runtime_options", ov::key_cache_group_size.name());
+    model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name());
+    // properties set on the core have higher priority than rt_info
+    model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name());
+    model->set_rt_info("f32", "runtime_options", ov::value_cache_precision.name());
 
     OV_ASSERT_NO_THROW(compiledModel = core.compile_model(model, deviceName));
     OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
     OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size));
+    OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size));
+    OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision));
+    OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision));
+
     ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f32);
     ASSERT_EQ(size.as<uint64_t>(), 16);
+    ASSERT_EQ(keySize.as<uint64_t>(), 8);  // core property value, not rt_info "32"
+    ASSERT_EQ(valueSize.as<uint64_t>(), 8);  // core property value, not rt_info "16"
+    ASSERT_EQ(keyCacheType.as<ov::element::Type>(), ov::element::f16);
+    ASSERT_EQ(valueCacheType.as<ov::element::Type>(), ov::element::bf16);
 }
 
-} // namespace
+}  // namespace
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
index 589f0641eae0e8..c6289a4dc80716 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -56,10 +56,10 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
         RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RW_property(ov::hint::dynamic_quantization_group_size.name()),
         RW_property(ov::hint::kv_cache_precision.name()),
-        RW_property(ov::hint::key_cache_precision.name()),
-        RW_property(ov::hint::value_cache_precision.name()),
-        RW_property(ov::hint::key_cache_group_size.name()),
-        RW_property(ov::hint::value_cache_group_size.name()),
+        RW_property(ov::key_cache_precision.name()),
+        RW_property(ov::value_cache_precision.name()),
+        RW_property(ov::key_cache_group_size.name()),
+        RW_property(ov::value_cache_group_size.name()),
     };
 
     ov::Core ie;