[CPU]PageAttn with 4bit-quantization #27992

Open: wants to merge 35 commits into base: master

Commits (35)
15fcdb8
[CPU]separate precisions of kv cache
zhangYiIntel Oct 31, 2024
82f843a
[CPU]use element as template args
zhangYiIntel Nov 6, 2024
a754404
[CPU]make quantize grouped
zhangYiIntel Nov 8, 2024
2aba224
[CPU]make u8 kernel grouped
zhangYiIntel Nov 13, 2024
fc435f6
[CPU]U4 Group size support with reference
zhangYiIntel Nov 18, 2024
d080e2a
[CPU]AVX512 support for u4 kernel
zhangYiIntel Nov 28, 2024
78ef4dd
[CPU]Support S4 quantization
zhangYiIntel Nov 29, 2024
3e821ea
[CPU]use AVX512 to quant s4
zhangYiIntel Nov 29, 2024
80b093f
[CPU]4-bit quantization with avx2
zhangYiIntel Dec 5, 2024
13a496e
fix build on elder compiler
zhangYiIntel Dec 6, 2024
92e6cb3
[CPU]fix fp32 inference
zhangYiIntel Dec 9, 2024
91ebc09
[CPU]set group size via hint
zhangYiIntel Dec 10, 2024
685f263
[CPU]fix code style
zhangYiIntel Dec 10, 2024
e56639a
[CPU]fix property test
zhangYiIntel Dec 11, 2024
a34ce8b
[CPU]add cache precision check
zhangYiIntel Dec 11, 2024
8548773
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Dec 12, 2024
fe6c311
[CPU]fix code style of config.cpp
zhangYiIntel Dec 12, 2024
522215a
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Dec 12, 2024
8faadd8
[CPU]pre calculate count
zhangYiIntel Dec 17, 2024
b4b0f0d
[CPU]Use ov::element as template args
zhangYiIntel Dec 18, 2024
5c838f7
[CPU]remove redundant marco
zhangYiIntel Dec 18, 2024
c98cec9
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Dec 19, 2024
f03e23c
apply review comments
zhangYiIntel Dec 19, 2024
99d5c4d
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Dec 19, 2024
dddb4d9
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Dec 19, 2024
c362399
[CPU]apply review comments
zhangYiIntel Jan 3, 2025
28bcf7b
[CPU]remove useless code of s4
zhangYiIntel Jan 3, 2025
94522a2
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Jan 3, 2025
56245d0
[CPU]Unify u8/u4 dequant kernel with template arg
zhangYiIntel Jan 5, 2025
84f03a3
[CPU]Define key/value cache prec/group_size priority
zhangYiIntel Jan 6, 2025
e0b437e
[CPU]fix prec order & check group_size
zhangYiIntel Jan 6, 2025
79df402
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Jan 6, 2025
f196535
Merge branch 'master' into yi3/4bit-cache
zhangYiIntel Jan 6, 2025
0515410
[CPU]fix sdpa test
zhangYiIntel Jan 7, 2025
7a412f7
[CPU]fix group_size in sdpa
zhangYiIntel Jan 7, 2025
@@ -30,6 +30,10 @@
from openvino._pyopenvino.properties import loaded_from_cache
from openvino._pyopenvino.properties import cache_encryption_callbacks
from openvino._pyopenvino.properties import weights_path
from openvino._pyopenvino.properties import key_cache_precision
from openvino._pyopenvino.properties import value_cache_precision
from openvino._pyopenvino.properties import key_cache_group_size
from openvino._pyopenvino.properties import value_cache_group_size

# Submodules
from openvino.runtime.properties import hint
@@ -44,6 +44,10 @@ void regmodule_properties(py::module m) {
wrap_property_RW(m_properties, ov::force_tbb_terminate, "force_tbb_terminate");
wrap_property_RW(m_properties, ov::enable_mmap, "enable_mmap");
wrap_property_RW(m_properties, ov::weights_path, "weights_path");
wrap_property_RW(m_properties, ov::key_cache_precision, "key_cache_precision");
wrap_property_RW(m_properties, ov::value_cache_precision, "value_cache_precision");
wrap_property_RW(m_properties, ov::key_cache_group_size, "key_cache_group_size");
wrap_property_RW(m_properties, ov::value_cache_group_size, "value_cache_group_size");

wrap_property_RO(m_properties, ov::supported_properties, "supported_properties");
wrap_property_RO(m_properties, ov::available_devices, "available_devices");
12 changes: 12 additions & 0 deletions src/bindings/python/tests/test_runtime/test_properties.py
@@ -271,6 +271,18 @@ def test_properties_ro(ov_property_ro, expected_value):
"WEIGHTS_PATH",
(("./model.bin", "./model.bin"),),
),
(
props.key_cache_group_size,
"KEY_CACHE_GROUP_SIZE",
((64, 64),),
),
(
props.value_cache_group_size,
"VALUE_CACHE_GROUP_SIZE",
((64, 64),),
),
(props.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(props.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(hints.inference_precision, "INFERENCE_PRECISION_HINT", ((Type.f32, Type.f32),)),
(
hints.model_priority,
24 changes: 24 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
@@ -1359,4 +1359,28 @@ static constexpr Property<std::vector<std::string>, PropertyMutability::RO> exec
* @note This property is used for weightless caching. Only used when ov::CacheMode Property is set to "OPTIMIZE_SIZE".
*/
static constexpr Property<std::string, PropertyMutability::RW> weights_path{"WEIGHTS_PATH"};

/**
* @brief The precision of key cache compression
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<element::Type, PropertyMutability::RW> key_cache_precision{"KEY_CACHE_PRECISION"};

/**
* @brief The precision of value cache compression
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<element::Type, PropertyMutability::RW> value_cache_precision{"VALUE_CACHE_PRECISION"};

/**
* @brief The group_size of key cache compression
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};

/**
* @brief The group_size of value cache compression
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};
} // namespace ov
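
Usage sketch (illustration only, not part of the diff): the new properties can be passed as compile-time configuration. This assumes an ov::Core instance and a hypothetical "model.xml" path; the key-cache u8 / value-cache u4 combination matches the precisions this branch accepts, and the group size of 32 is the default shown in config.h further below.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // hypothetical model path

        // Key cache quantized to u8, value cache to 4-bit u4; each group of 32
        // elements along the quantized axis shares one scale/zero-point.
        auto compiled = core.compile_model(model,
                                           "CPU",
                                           ov::key_cache_precision(ov::element::u8),
                                           ov::value_cache_precision(ov::element::u4),
                                           ov::key_cache_group_size(32),
                                           ov::value_cache_group_size(32));
        return 0;
    }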
12 changes: 12 additions & 0 deletions src/plugins/intel_cpu/src/compiled_model.cpp
@@ -256,6 +256,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
RO_property(ov::hint::dynamic_quantization_group_size.name()),
RO_property(ov::hint::kv_cache_precision.name()),
RO_property(ov::key_cache_precision.name()),
RO_property(ov::value_cache_precision.name()),
RO_property(ov::key_cache_group_size.name()),
RO_property(ov::value_cache_group_size.name()),
};

OPENVINO_SUPPRESS_DEPRECATED_START
@@ -332,6 +336,14 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize);
} else if (name == ov::hint::kv_cache_precision) {
return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
} else if (name == ov::key_cache_precision) {
return decltype(ov::key_cache_precision)::value_type(config.keyCachePrecision);
} else if (name == ov::value_cache_precision) {
return decltype(ov::value_cache_precision)::value_type(config.valueCachePrecision);
} else if (name == ov::key_cache_group_size) {
return decltype(ov::key_cache_group_size)::value_type(config.keyCacheGroupSize);
} else if (name == ov::value_cache_group_size) {
return decltype(ov::value_cache_group_size)::value_type(config.valueCacheGroupSize);
}
OPENVINO_THROW("Unsupported property: ", name);
}
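
Usage sketch (illustration only): once the model is compiled, the effective settings resolved by the code above can be read back through CompiledModel::get_property, assuming compiled was produced by core.compile_model(model, "CPU", ...):

    // Read back the cache compression settings the CPU plugin actually selected.
    auto key_prec  = compiled.get_property(ov::key_cache_precision);
    auto val_prec  = compiled.get_property(ov::value_cache_precision);
    auto key_group = compiled.get_property(ov::key_cache_group_size);   // 32 by default (see config.h below)
    auto val_group = compiled.get_property(ov::value_cache_group_size);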
86 changes: 85 additions & 1 deletion src/plugins/intel_cpu/src/config.cpp
@@ -373,6 +373,60 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
ov::hint::kv_cache_precision.name(),
". Supported values: u8, bf16, f16, f32");
}
} else if (key == ov::key_cache_precision.name()) {
try {
keyCachePrecisionSetExplicitly = true;
auto const prec = val.as<ov::element::Type>();
if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) {
keyCachePrecision = prec;
} else {
OPENVINO_THROW("keyCachePrecision doesn't support value ", prec);
}
} catch (ov::Exception&) {
OPENVINO_THROW("Wrong value ",
val.as<std::string>(),
" for property key ",
ov::key_cache_precision.name(),
". Supported values: u8, bf16, f16, f32");
}
} else if (key == ov::value_cache_precision.name()) {
try {
valueCachePrecisionSetExplicitly = true;
auto const prec = val.as<ov::element::Type>();
if (one_of(prec,
ov::element::f32,
ov::element::f16,
ov::element::bf16,
ov::element::u8,
ov::element::u4)) {
valueCachePrecision = prec;
} else {
OPENVINO_THROW("valueCachePrecision doesn't support value ", prec);
}
} catch (ov::Exception&) {
OPENVINO_THROW("Wrong value ",
val.as<std::string>(),
" for property key ",
ov::value_cache_precision.name(),
". Supported values: u4, u8, bf16, f16, f32");
}
} else if (key == ov::key_cache_group_size.name() || key == ov::value_cache_group_size.name()) {
try {
auto const groupSize = val.as<uint64_t>();
if (key == ov::key_cache_group_size.name()) {
keyCacheGroupSizeSetExplicitly = true;
keyCacheGroupSize = groupSize;
} else {
valueCacheGroupSizeSetExplicitly = true;
valueCacheGroupSize = groupSize;
}
} catch (ov::Exception&) {
OPENVINO_THROW("Wrong value ",
val.as<std::string>(),
" for property key ",
key,
". Expected only unsigned integer numbers");
}
} else if (key == ov::cache_encryption_callbacks.name()) {
try {
auto encryption_callbacks = val.as<EncryptionCallbacks>();
@@ -408,6 +462,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
aclFastMath = true;
}
#endif
// key/value cache precision has higher priority; if it is not set explicitly, fall back to kvCachePrecision
if (!keyCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
keyCachePrecision = kvCachePrecision;
}
if (!valueCachePrecisionSetExplicitly && kvCachePrecisionSetExplicitly) {
valueCachePrecision = kvCachePrecision;
}
// disable dynamic quantization and kv quantization for best accuracy
if (executionMode == ov::hint::ExecutionMode::ACCURACY) {
if (!fcDynamicQuantizationGroupSizeSetExplicitly) {
@@ -416,6 +477,12 @@
if (!kvCachePrecisionSetExplicitly) {
kvCachePrecision = ov::element::f32;
}
if (!keyCachePrecisionSetExplicitly) {
keyCachePrecision = ov::element::f32;
}
if (!valueCachePrecisionSetExplicitly) {
valueCachePrecision = ov::element::f32;
}
}

if (!prop.empty())
@@ -462,14 +529,31 @@ void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
// if user sets explicitly, it will be higher priority than rt_info
if (!kvCachePrecisionSetExplicitly &&
model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
this->kvCachePrecision =
this->kvCachePrecision = this->keyCachePrecision = this->valueCachePrecision =
model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
}
if (!fcDynamicQuantizationGroupSizeSetExplicitly &&
model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
this->fcDynamicQuantizationGroupSize =
model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
}
if (!keyCachePrecisionSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_precision.name()})) {
this->keyCachePrecision =
model->get_rt_info<ov::element::Type>({"runtime_options", ov::key_cache_precision.name()});
}
if (!valueCachePrecisionSetExplicitly &&
model->has_rt_info({"runtime_options", ov::value_cache_precision.name()})) {
this->valueCachePrecision =
model->get_rt_info<ov::element::Type>({"runtime_options", ov::value_cache_precision.name()});
}
if (!keyCacheGroupSizeSetExplicitly && model->has_rt_info({"runtime_options", ov::key_cache_group_size.name()})) {
this->keyCacheGroupSize = model->get_rt_info<uint64_t>({"runtime_options", ov::key_cache_group_size.name()});
}
if (!valueCacheGroupSizeSetExplicitly &&
model->has_rt_info({"runtime_options", ov::value_cache_group_size.name()})) {
this->valueCacheGroupSize =
model->get_rt_info<uint64_t>({"runtime_options", ov::value_cache_group_size.name()});
}
}

} // namespace intel_cpu
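
Precedence sketch (illustration only): as implemented in readProperties above, the legacy ov::hint::kv_cache_precision seeds both caches when the new per-cache keys are not set explicitly, and an explicit per-cache key overrides only its own side. A hypothetical combination:

    // f16 becomes the fallback for both caches; the explicit value-cache property
    // wins on the value side, so the key cache resolves to f16 and the value cache to u4.
    auto compiled = core.compile_model(model,
                                       "CPU",
                                       ov::hint::kv_cache_precision(ov::element::f16),
                                       ov::value_cache_precision(ov::element::u4));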
10 changes: 10 additions & 0 deletions src/plugins/intel_cpu/src/config.h
@@ -48,17 +48,27 @@ struct Config {
uint64_t fcDynamicQuantizationGroupSize = 32;
bool fcDynamicQuantizationGroupSizeSetExplicitly = false;
bool kvCachePrecisionSetExplicitly = false;
bool keyCachePrecisionSetExplicitly = false;
bool valueCachePrecisionSetExplicitly = false;
bool keyCacheGroupSizeSetExplicitly = false;
bool valueCacheGroupSizeSetExplicitly = false;
#if defined(OV_CPU_WITH_ACL)
bool aclFastMath = false;
#endif
#if defined(OPENVINO_ARCH_X86_64)
ov::element::Type kvCachePrecision = ov::element::u8;
ov::element::Type keyCachePrecision = ov::element::u8;
ov::element::Type valueCachePrecision = ov::element::u8;
size_t rtCacheCapacity = 5000ul;
#else
ov::element::Type kvCachePrecision = ov::element::f16;
ov::element::Type keyCachePrecision = ov::element::f16;
ov::element::Type valueCachePrecision = ov::element::f16;
// TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives
size_t rtCacheCapacity = 0ul;
#endif
size_t keyCacheGroupSize = 32ul;
size_t valueCacheGroupSize = 32ul;
ov::threading::IStreamsExecutor::Config streamExecutorConfig;
int streams = 1;
bool streamsChanged = false;
Expand Down