From 84f03a3fac7967bb3f71b32cb5f9deee46ec7d59 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 6 Jan 2025 11:03:47 +0800 Subject: [PATCH] [CPU]Define key/value cache prec/group_size priority Signed-off-by: Zhang Yi --- .../openvino/runtime/properties/__init__.py | 4 + .../runtime/properties/hint/__init__.py | 4 - .../pyopenvino/core/properties/properties.cpp | 8 +- .../tests/test_runtime/test_properties.py | 24 ++-- .../include/openvino/runtime/properties.hpp | 48 +++---- src/plugins/intel_cpu/src/compiled_model.cpp | 24 ++-- src/plugins/intel_cpu/src/config.cpp | 48 +++++-- src/plugins/intel_cpu/src/config.h | 4 + .../intel_cpu/src/nodes/scaled_attn.cpp | 19 ++- src/plugins/intel_cpu/src/plugin.cpp | 24 ++-- .../ov_executable_network/properties.cpp | 128 +++++++++++++----- .../custom/behavior/ov_plugin/properties.cpp | 8 +- 12 files changed, 225 insertions(+), 118 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/properties/__init__.py b/src/bindings/python/src/openvino/runtime/properties/__init__.py index 3269ea42e32ac2..a02a18e556135b 100644 --- a/src/bindings/python/src/openvino/runtime/properties/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/__init__.py @@ -30,6 +30,10 @@ from openvino._pyopenvino.properties import loaded_from_cache from openvino._pyopenvino.properties import cache_encryption_callbacks from openvino._pyopenvino.properties import weights_path +from openvino._pyopenvino.properties import key_cache_precision +from openvino._pyopenvino.properties import value_cache_precision +from openvino._pyopenvino.properties import key_cache_group_size +from openvino._pyopenvino.properties import value_cache_group_size # Submodules from openvino.runtime.properties import hint diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py index d5c5d5595e5e0b..d1dce289d09941 100644 --- 
a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py @@ -23,8 +23,4 @@ from openvino._pyopenvino.properties.hint import allow_auto_batching from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size from openvino._pyopenvino.properties.hint import kv_cache_precision -from openvino._pyopenvino.properties.hint import key_cache_precision -from openvino._pyopenvino.properties.hint import value_cache_precision -from openvino._pyopenvino.properties.hint import key_cache_group_size -from openvino._pyopenvino.properties.hint import value_cache_group_size from openvino._pyopenvino.properties.hint import activations_scale_factor diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index 2b997c6664cee0..937e9b66a0135f 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -44,6 +44,10 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate"); wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap"); wrap_property_RW(m_properties, ov::weights_path, "weights_path"); + wrap_property_RW(m_properties, ov::key_cache_precision, "key_cache_precision"); + wrap_property_RW(m_properties, ov::value_cache_precision, "value_cache_precision"); + wrap_property_RW(m_properties, ov::key_cache_group_size, "key_cache_group_size"); + wrap_property_RW(m_properties, ov::value_cache_group_size, "value_cache_group_size"); wrap_property_RO(m_properties, ov::supported_properties, "supported_properties"); wrap_property_RO(m_properties, ov::available_devices, "available_devices"); @@ -101,10 +105,6 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching"); 
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size"); wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision"); - wrap_property_RW(m_hint, ov::hint::key_cache_precision, "key_cache_precision"); - wrap_property_RW(m_hint, ov::hint::value_cache_precision, "value_cache_precision"); - wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size"); - wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size"); wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor"); // Submodule intel_cpu diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index d0745f84361310..cbdd117c9fe97f 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -271,6 +271,18 @@ def test_properties_ro(ov_property_ro, expected_value): "WEIGHTS_PATH", (("./model.bin", "./model.bin"),), ), + ( + props.key_cache_group_size, + "KEY_CACHE_GROUP_SIZE", + ((64, 64),), + ), + ( + props.value_cache_group_size, + "VALUE_CACHE_GROUP_SIZE", + ((64, 64),), + ), + (props.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)), + (props.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)), (hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)), ( hints.model_priority, @@ -334,19 +346,7 @@ def test_properties_ro(ov_property_ro, expected_value): "DYNAMIC_QUANTIZATION_GROUP_SIZE", ((64, 64),), ), - ( - hints.key_cache_group_size, - "KEY_CACHE_GROUP_SIZE", - ((64, 64),), - ), - ( - hints.value_cache_group_size, - "VALUE_CACHE_GROUP_SIZE", - ((64, 64),), - ), (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)), - (hints.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)), - (hints.value_cache_precision, 
"VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)), ( hints.activations_scale_factor, "ACTIVATIONS_SCALE_FACTOR", diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 729ccc93feac1f..c7570b818f9665 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -580,30 +580,6 @@ static constexpr Property dynamic_quantization */ static constexpr Property kv_cache_precision{"KV_CACHE_PRECISION"}; -/** - * @brief Hint for device to use specified precision for key cache compression - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr Property key_cache_precision{"KEY_CACHE_PRECISION"}; - -/** - * @brief Hint for device to use specified precision for value cache compression - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr Property value_cache_precision{"VALUE_CACHE_PRECISION"}; - -/** - * @brief Hint for device to use group_size for key cache compression - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr Property key_cache_group_size{"KEY_CACHE_GROUP_SIZE"}; - -/** - * @brief Hint for device to use group_size for value cache compression - * @ingroup ov_runtime_cpp_prop_api - */ -static constexpr Property value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"}; - /** * @brief This property scales down activations to prevent overflows when inference precision is f16. * @ingroup ov_runtime_cpp_prop_api @@ -1383,4 +1359,28 @@ static constexpr Property, PropertyMutability::RO> exec * @note This property is used for weightless caching. Only used when ov::CacheMode Property is set to "OPTIMIZE_SIZE". 
*/ static constexpr Property weights_path{"WEIGHTS_PATH"}; + +/** + * @brief The precision of key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_precision{"KEY_CACHE_PRECISION"}; + +/** + * @brief The precision of value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_precision{"VALUE_CACHE_PRECISION"}; + +/** + * @brief The group_size of key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_group_size{"KEY_CACHE_GROUP_SIZE"}; + +/** + * @brief The group_size of value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"}; } // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index 59ba95ffbeb4c1..60b63f871e0c95 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -256,10 +256,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const { RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), - RO_property(ov::hint::key_cache_precision.name()), - RO_property(ov::hint::value_cache_precision.name()), - RO_property(ov::hint::key_cache_group_size.name()), - RO_property(ov::hint::value_cache_group_size.name()), + RO_property(ov::key_cache_precision.name()), + RO_property(ov::value_cache_precision.name()), + RO_property(ov::key_cache_group_size.name()), + RO_property(ov::value_cache_group_size.name()), }; OPENVINO_SUPPRESS_DEPRECATED_START @@ -336,14 +336,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const { return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize); } else if (name == 
ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); - } else if (name == ov::hint::key_cache_precision) { - return decltype(ov::hint::key_cache_precision)::value_type(config.keyCachePrecision); - } else if (name == ov::hint::value_cache_precision) { - return decltype(ov::hint::value_cache_precision)::value_type(config.valueCachePrecision); - } else if (name == ov::hint::key_cache_group_size) { - return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize); - } else if (name == ov::hint::value_cache_group_size) { - return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize); + } else if (name == ov::key_cache_precision) { + return decltype(ov::key_cache_precision)::value_type(config.keyCachePrecision); + } else if (name == ov::value_cache_precision) { + return decltype(ov::value_cache_precision)::value_type(config.valueCachePrecision); + } else if (name == ov::key_cache_group_size) { + return decltype(ov::key_cache_group_size)::value_type(config.keyCacheGroupSize); + } else if (name == ov::value_cache_group_size) { + return decltype(ov::value_cache_group_size)::value_type(config.valueCacheGroupSize); } OPENVINO_THROW("Unsupported property: ", name); } diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 3b052e7094d34c..1004ae076eac10 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -373,9 +373,9 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ov::hint::kv_cache_precision.name(), ". 
Supported values: u8, bf16, f16, f32"); } - } else if (key == ov::hint::key_cache_precision.name()) { + } else if (key == ov::key_cache_precision.name()) { try { - kvCachePrecisionSetExplicitly = true; + keyCachePrecisionSetExplicitly = true; auto const prec = val.as(); if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { keyCachePrecision = prec; @@ -386,12 +386,12 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { OPENVINO_THROW("Wrong value ", val.as(), " for property key ", - ov::hint::key_cache_precision.name(), + ov::key_cache_precision.name(), ". Supported values: u8, bf16, f16, f32"); } - } else if (key == ov::hint::value_cache_precision.name()) { + } else if (key == ov::value_cache_precision.name()) { try { - kvCachePrecisionSetExplicitly = true; + valueCachePrecisionSetExplicitly = true; auto const prec = val.as(); if (one_of(prec, ov::element::f32, @@ -407,15 +407,17 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { OPENVINO_THROW("Wrong value ", val.as(), " for property key ", - ov::hint::value_cache_precision.name(), + ov::value_cache_precision.name(), ". 
Supported values: u4, u8, bf16, f16, f32"); } - } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) { + } else if (key == ov::key_cache_group_size.name() || key == ov::value_cache_group_size.name()) { try { auto const groupSize = val.as(); - if (key == ov::hint::key_cache_group_size.name()) { + if (key == ov::key_cache_group_size.name()) { + keyCacheGroupSizeSetExplicitly = true; keyCacheGroupSize = groupSize; } else { + valueCacheGroupSizeSetExplicitly = true; valueCacheGroupSize = groupSize; } } catch (ov::Exception&) { @@ -460,6 +462,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { aclFastMath = true; } #endif + // key/value cache precision has higher priority, if not defined use kvCachePrecision + if (!keyCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) { + keyCachePrecision = kvCachePrecision; + } + if (!valueCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) { + valueCachePrecision = kvCachePrecision; + } // disable dynamic quantization and kv quantization for best accuracy if (executionMode == ov::hint::ExecutionMode::ACCURACY) { if (!fcDynamicQuantizationGroupSizeSetExplicitly) { @@ -467,9 +476,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { } if (!kvCachePrecisionSetExplicitly) { kvCachePrecision = ov::element::f32; - valueCachePrecision = ov::element::f32; + } + if (!keyCachePrecisionSetExplicitly) { keyCachePrecision = ov::element::f32; } + if (!valueCachePrecisionSetExplicitly) { + valueCachePrecision = ov::element::f32; + } } if (!prop.empty()) @@ -524,6 +537,23 @@ void Config::applyRtInfo(const std::shared_ptr& model) { this->fcDynamicQuantizationGroupSize = model->get_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()}); } + if (!keyCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_precision.name()})) { + this->keyCachePrecision = + 
model->get_rt_info({"runtime_options", ov::key_cache_precision.name()}); + } + if (!valueCachePrecisionSetExplicitly && + model->has_rt_info({"runtime_options", ov::value_cache_precision.name()})) { + this->valueCachePrecision = + model->get_rt_info({"runtime_options", ov::value_cache_precision.name()}); + } + if (!keyCacheGroupSizeSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_group_size.name()})) { + this->keyCacheGroupSize = model->get_rt_info({"runtime_options", ov::key_cache_group_size.name()}); + } + if (!valueCacheGroupSizeSetExplicitly && + model->has_rt_info({"runtime_options", ov::value_cache_group_size.name()})) { + this->valueCacheGroupSize = + model->get_rt_info({"runtime_options", ov::value_cache_group_size.name()}); + } } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 94d4b6e90c531d..75bfde2303a34f 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -48,6 +48,10 @@ struct Config { uint64_t fcDynamicQuantizationGroupSize = 32; bool fcDynamicQuantizationGroupSizeSetExplicitly = false; bool kvCachePrecisionSetExplicitly = false; + bool keyCachePrecisionSetExplicitly = false; + bool valueCachePrecisionSetExplicitly = false; + bool keyCacheGroupSizeSetExplicitly = false; + bool valueCacheGroupSizeSetExplicitly = false; #if defined(OV_CPU_WITH_ACL) bool aclFastMath = false; #endif diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index c0d19a9acd6e15..41d87e3388a035 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -1061,7 +1061,14 @@ ScaledDotProductAttention::ScaledDotProductAttention(const std::shared_ptrgetConfig(); + const auto& keyCachePrecision = cpuConfig.keyCachePrecision; + const auto& valueCachePrecision = cpuConfig.valueCachePrecision; + OPENVINO_ASSERT(valueCachePrecision == 
keyCachePrecision, + "CPU: SDPA node only supports same key/value cache precision"); + OPENVINO_ASSERT(one_of(keyCachePrecision, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8), + "CPU: SDPA only supports key/value cache precision f32, f16, bf16, u8 but gets ", + keyCachePrecision); if (const auto node = std::dynamic_pointer_cast(op)) { m_config.config.is_causal = node->get_causal(); } else if (const auto node = std::dynamic_pointer_cast(op)) { @@ -1835,12 +1842,16 @@ void ScaledDotProductAttention::updatePastkv(const MemoryPtr& mem_cur_k, const M ov::element::Type ScaledDotProductAttention::getKVCachePrecision() { ov::element::Type kvcache_precision; + // TODO: SDPA only supports same key/value cache precision. auto rtPrecision = getRuntimePrecision(); - auto kvCachePrecisionHint = context->getConfig().kvCachePrecision; + auto keyCachePrecisionHint = context->getConfig().keyCachePrecision; + auto valueCachePrecisionHint = context->getConfig().valueCachePrecision; bool enableKVCacheFP16 = m_config.config.fuse_concat && mayiuse(cpu_isa_t::avx2) && - rtPrecision != ov::element::bf16 && kvCachePrecisionHint == ov::element::f16; + rtPrecision != ov::element::bf16 && + (keyCachePrecisionHint == ov::element::f16 && valueCachePrecisionHint == ov::element::f16); kvcache_precision = enableKVCacheFP16 ? 
ov::element::f16 : rtPrecision; - bool use_int8_kv_cache_precision = kvCachePrecisionHint == ov::element::u8; + bool use_int8_kv_cache_precision = + (keyCachePrecisionHint == ov::element::u8 && valueCachePrecisionHint == ov::element::u8); if (use_int8_kv_cache_precision) kvcache_precision = ov::element::u8; else diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 1c7c79a9c9c6e0..ec9b37c2c2d22e 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -392,14 +392,14 @@ ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& options) engConfig.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision); - } else if (name == ov::hint::key_cache_precision) { - return decltype(ov::hint::key_cache_precision)::value_type(engConfig.keyCachePrecision); - } else if (name == ov::hint::value_cache_precision) { - return decltype(ov::hint::value_cache_precision)::value_type(engConfig.valueCachePrecision); - } else if (name == ov::hint::key_cache_group_size) { - return decltype(ov::hint::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize); - } else if (name == ov::hint::value_cache_group_size) { - return decltype(ov::hint::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize); + } else if (name == ov::key_cache_precision) { + return decltype(ov::key_cache_precision)::value_type(engConfig.keyCachePrecision); + } else if (name == ov::value_cache_precision) { + return decltype(ov::value_cache_precision)::value_type(engConfig.valueCachePrecision); + } else if (name == ov::key_cache_group_size) { + return decltype(ov::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize); + } else if (name == ov::value_cache_group_size) { + return decltype(ov::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize); } return get_ro_property(name, 
options); } @@ -443,10 +443,10 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), - RW_property(ov::hint::key_cache_precision.name()), - RW_property(ov::hint::value_cache_precision.name()), - RW_property(ov::hint::key_cache_group_size.name()), - RW_property(ov::hint::value_cache_group_size.name()), + RW_property(ov::key_cache_precision.name()), + RW_property(ov::value_cache_precision.name()), + RW_property(ov::key_cache_group_size.name()), + RW_property(ov::value_cache_group_size.name()), }; OPENVINO_SUPPRESS_DEPRECATED_START diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 016648a7e1026f..9d38d03e5eadde 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/runtime/properties.hpp" + #include -#include "utils/properties_test.hpp" -#include "openvino/runtime/system_conf.hpp" -#include "openvino/runtime/core.hpp" #include "openvino/runtime/compiled_model.hpp" -#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/core.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" +#include "openvino/runtime/system_conf.hpp" +#include "utils/properties_test.hpp" namespace { @@ -41,10 +42,10 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), - 
RO_property(ov::hint::key_cache_precision.name()), - RO_property(ov::hint::value_cache_precision.name()), - RO_property(ov::hint::key_cache_group_size.name()), - RO_property(ov::hint::value_cache_group_size.name()), + RO_property(ov::key_cache_precision.name()), + RO_property(ov::value_cache_precision.name()), + RO_property(ov::key_cache_group_size.name()), + RO_property(ov::value_cache_group_size.name()), }; ov::Core ie; @@ -88,7 +89,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSetROPropertiesThrow) { TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanThroughputHint) { ov::Core ie; - int32_t streams = 1; // throughput hint should apply higher number of streams + int32_t streams = 1; // throughput hint should apply higher number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams))); @@ -101,7 +102,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriorityThanLatencyHint) { ov::Core ie; - int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams + int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::num_streams(streams))); @@ -114,7 +115,7 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreStreamsHasHigherPriori TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanLatencyHint) { ov::Core ie; - int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams + int32_t streams = ov::get_number_of_cpu_cores(); // latency hint should apply lower number of streams int32_t value = 0; OV_ASSERT_NO_THROW(ie.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))); @@ -129,7 +130,7 @@ 
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPrior TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelStreamsHasHigherPriorityThanThroughputHint) { ov::Core ie; - int32_t streams = 1; // throughput hint should apply higher number of streams + int32_t streams = 1; // throughput hint should apply higher number of streams int32_t value = 0; ov::AnyMap config; @@ -190,14 +191,14 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckKVCachePrecision) { TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCachePrecision) { ov::Core core; - core.set_property(deviceName, ov::hint::key_cache_precision(ov::element::f16)); - core.set_property(deviceName, ov::hint::value_cache_precision(ov::element::u4)); + core.set_property(deviceName, ov::key_cache_precision(ov::element::f16)); + core.set_property(deviceName, ov::value_cache_precision(ov::element::u4)); ov::CompiledModel compiledModel = core.compile_model(model, deviceName); auto key_cache_precision_value = ov::element::undefined; auto value_cache_precision_value = ov::element::undefined; - OV_ASSERT_NO_THROW(key_cache_precision_value = compiledModel.get_property(ov::hint::key_cache_precision)); - OV_ASSERT_NO_THROW(value_cache_precision_value = compiledModel.get_property(ov::hint::value_cache_precision)); + OV_ASSERT_NO_THROW(key_cache_precision_value = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(value_cache_precision_value = compiledModel.get_property(ov::value_cache_precision)); ASSERT_EQ(key_cache_precision_value, ov::element::f16); ASSERT_EQ(value_cache_precision_value, ov::element::u4); } @@ -205,14 +206,14 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCachePrecision) { TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkFinetuneKVCacheGroupSize) { ov::Core core; - core.set_property(deviceName, ov::hint::key_cache_group_size(32)); - core.set_property(deviceName, ov::hint::value_cache_group_size(16)); + 
core.set_property(deviceName, ov::key_cache_group_size(32)); + core.set_property(deviceName, ov::value_cache_group_size(16)); ov::CompiledModel compiledModel = core.compile_model(model, deviceName); auto key_cache_group_size_value = 0; auto value_cache_group_size_value = 0; - OV_ASSERT_NO_THROW(key_cache_group_size_value = compiledModel.get_property(ov::hint::key_cache_group_size)); - OV_ASSERT_NO_THROW(value_cache_group_size_value = compiledModel.get_property(ov::hint::value_cache_group_size)); + OV_ASSERT_NO_THROW(key_cache_group_size_value = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(value_cache_group_size_value = compiledModel.get_property(ov::value_cache_group_size)); ASSERT_EQ(key_cache_group_size_value, 32); ASSERT_EQ(value_cache_group_size_value, 16); } @@ -260,7 +261,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckExecutionModeIsAvailableIn ASSERT_FALSE(model_exec_mode_it->is_mutable()); } -TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCoreInferencePrecision) { ov::Core ie; auto inference_precision_value = ov::element::undefined; @@ -274,7 +276,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHas ASSERT_EQ(inference_precision_value, bf16_if_can_be_emulated); } -TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckCoreInferencePrecisionHasHigherPriorityThanModelPerformanceExecutionMode) { ov::Core ie; auto execution_mode_value = ov::hint::ExecutionMode::ACCURACY; auto inference_precision_value = ov::element::undefined; @@ -292,7 +295,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCoreInferencePrecisionHasH ASSERT_EQ(inference_precision_value, ov::element::f32); } 
-TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) { +TEST_F(OVClassConfigTestCPU, + smoke_CpuExecNetworkCheckModelInferencePrecisionHasHigherPriorityThanCorePerformanceExecutionMode) { ov::Core ie; auto execution_mode_value = ov::hint::ExecutionMode::PERFORMANCE; auto inference_precision_value = ov::element::undefined; @@ -323,14 +327,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckLogLevel) { OV_ASSERT_NO_THROW(value = compiledModel.get_property(ov::log::level)); ASSERT_EQ(value.as(), ov::log::Level::NO); } - //check set and get - const std::vector logLevels = { - ov::log::Level::ERR, - ov::log::Level::NO, - ov::log::Level::WARNING, - ov::log::Level::INFO, - ov::log::Level::DEBUG, - ov::log::Level::TRACE}; + // check set and get + const std::vector logLevels = {ov::log::Level::ERR, + ov::log::Level::NO, + ov::log::Level::WARNING, + ov::log::Level::INFO, + ov::log::Level::DEBUG, + ov::log::Level::TRACE}; for (unsigned int i = 0; i < logLevels.size(); i++) { ov::Any value; @@ -365,50 +368,109 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) { ov::Core ie; ov::Any type; ov::Any size; + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("32", "runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name()); + model->set_rt_info("u8", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("u8", "runtime_options", ov::value_cache_precision.name()); OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName)); OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); 
OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); ASSERT_EQ(type.as(), ov::element::f16); ASSERT_EQ(size.as(), 0); + ASSERT_EQ(keySize.as(), 32); + ASSERT_EQ(valueSize.as(), 16); + ASSERT_EQ(keyCacheType.as(), ov::element::u8); + ASSERT_EQ(valueCacheType.as(), ov::element::u8); } TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) { ov::Core ie; ov::Any type; ov::Any size; + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("0", "runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("0", "runtime_options", ov::value_cache_group_size.name()); + model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("f32", "runtime_options", ov::value_cache_precision.name()); ov::AnyMap config; config[ov::hint::kv_cache_precision.name()] = "u8"; config[ov::hint::dynamic_quantization_group_size.name()] = "16"; + // property has higher priority than rt_info + config[ov::key_cache_group_size.name()] = "32"; + config[ov::value_cache_group_size.name()] = "16"; + // key/value cache prec has higher priority than kvCachePrec + config[ov::key_cache_precision.name()] = "f16"; + config[ov::value_cache_precision.name()] = "bf16"; OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config)); OV_ASSERT_NO_THROW(type
= compiledModel.get_property(ov::hint::kv_cache_precision)); OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); ASSERT_EQ(type.as(), ov::element::u8); ASSERT_EQ(size.as(), 16); + ASSERT_EQ(keySize.as(), 32); + ASSERT_EQ(valueSize.as(), 16); + ASSERT_EQ(keyCacheType.as(), ov::element::f16); + ASSERT_EQ(valueCacheType.as(), ov::element::bf16); } TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCoreProperties) { ov::Core core; ov::Any type; ov::Any size; - + ov::Any keySize; + ov::Any valueSize; + ov::Any keyCacheType; + ov::Any valueCacheType; core.set_property(deviceName, ov::hint::kv_cache_precision(ov::element::f32)); core.set_property(deviceName, ov::hint::dynamic_quantization_group_size(16)); + core.set_property(deviceName, ov::key_cache_group_size(8)); + core.set_property(deviceName, ov::value_cache_group_size(8)); + core.set_property(deviceName, ov::key_cache_precision(ov::element::f16)); + core.set_property(deviceName, ov::value_cache_precision(ov::element::bf16)); ov::CompiledModel compiledModel; model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + model->set_rt_info("32", "runtime_options", ov::key_cache_group_size.name()); + model->set_rt_info("16", "runtime_options", ov::value_cache_group_size.name()); + // User's setting has higher priority than rt_info + model->set_rt_info("f32", "runtime_options", ov::key_cache_precision.name()); + model->set_rt_info("f32", "runtime_options", 
ov::value_cache_precision.name()); OV_ASSERT_NO_THROW(compiledModel = core.compile_model(model, deviceName)); OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + OV_ASSERT_NO_THROW(keySize = compiledModel.get_property(ov::key_cache_group_size)); + OV_ASSERT_NO_THROW(valueSize = compiledModel.get_property(ov::value_cache_group_size)); + OV_ASSERT_NO_THROW(keyCacheType = compiledModel.get_property(ov::key_cache_precision)); + OV_ASSERT_NO_THROW(valueCacheType = compiledModel.get_property(ov::value_cache_precision)); + ASSERT_EQ(type.as(), ov::element::f32); ASSERT_EQ(size.as(), 16); + ASSERT_EQ(keySize.as(), 8); + ASSERT_EQ(valueSize.as(), 8); + ASSERT_EQ(keyCacheType.as(), ov::element::f16); + ASSERT_EQ(valueCacheType.as(), ov::element::bf16); } -} // namespace +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp index 589f0641eae0e8..c6289a4dc80716 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp @@ -56,10 +56,10 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) { RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), - RW_property(ov::hint::key_cache_precision.name()), - RW_property(ov::hint::value_cache_precision.name()), - RW_property(ov::hint::key_cache_group_size.name()), - RW_property(ov::hint::value_cache_group_size.name()), + RW_property(ov::key_cache_precision.name()), + RW_property(ov::value_cache_precision.name()), + RW_property(ov::key_cache_group_size.name()), + 
RW_property(ov::value_cache_group_size.name()), }; ov::Core ie;