From 51b67cac8980e4c8f9cf60a357547563e774151a Mon Sep 17 00:00:00 2001
From: dmitrygo
Date: Wed, 18 Dec 2024 16:26:12 +0400
Subject: [PATCH] review fixes

---
 .../executors/dnnl/dnnl_matmul_primitive.cpp  |  2 +-
 .../intel_cpu/src/nodes/fullyconnected.cpp    | 36 +++++++++++++
 .../intel_cpu/src/nodes/fullyconnected.h      |  2 +
 .../convert_to_cpu_specific_opset.hpp         | 23 +-------
 .../transformation_pipeline.cpp               | 23 +++-----
 .../src/arm/matmul_weights_decompression.cpp  | 31 ++++++-----
 .../classes/matmul_weights_decompression.cpp  | 29 +---------
 .../classes/matmul_weights_decompression.hpp  | 54 +++++++++++++------
 8 files changed, 105 insertions(+), 95 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
index c1e768d8120d88..9ffe4731689d43 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
@@ -155,7 +155,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs,
     const auto maxRank =
         std::max({srcDesc->getShape().getRank(), weiDesc->getShape().getRank(), dstDesc->getShape().getRank()});
-    auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank);
+    const auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank);
     if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) {
         auto dstPrc = ov::element::f32;
         dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES),
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index eb1efe3d69190d..4a2e3728887087 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -41,6 +41,42 @@ namespace ov {
 namespace intel_cpu {
 namespace node {
 
+ov::element::TypeVector FullyConnected::getSupportedCompressedWeightsTypes() {
+    using ov::element::Type_t;
+
+    bool useMatmulPrim = false;
+    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
+
+    if (useMatmulPrim) {
+        return {Type_t::u8, Type_t::i8};
+    } else {
+#if defined(OPENVINO_ARCH_X86_64)
+        return {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1};
+#else
+        return {};
+#endif
+    }
+}
+
+ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() {
+    using ov::element::Type_t;
+
+    bool useMatmulPrim = false;
+    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
+
+    if (useMatmulPrim) {
+        return {Type_t::f32, Type_t::f16};
+    } else {
+#if defined(OPENVINO_ARCH_X86_64)
+        // @todo enable for bf16 as well
+        // after EnforceInferencePrecision is replaced with ConvertPrecision
+        return {Type_t::f32};
+#else
+        return {};
+#endif
+    }
+}
+
 bool FullyConnected::isSupportedOperation(const std::shared_ptr<const ov::Node>& op,
                                           std::string& errorMessage) noexcept {
     try {
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 0b50d882c9e554..660e420d5c58cd 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -72,6 +72,8 @@ class FullyConnected : public Node {
                                               size_t OC,
                                               size_t G,
                                               ov::element::Type inferencePrecision) noexcept;
+    static ov::element::TypeVector getSupportedCompressedWeightsTypes();
+    static ov::element::TypeVector getSupportedCompressedActivationsTypes();
 
     bool isExecutable() const override {
         return !isInputTensorAtPortEmpty(0);
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
index 5e3062cc656853..614f7d690f8726 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
@@ -28,8 +28,6 @@ namespace ov {
 namespace intel_cpu {
 
 inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model>& model, const Config& config) {
-    using ov::element::Type_t;
-
     RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
 
     ov::pass::Manager manager("CPU:ConvertToCPUSpecificOpset");
@@ -38,28 +36,11 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model>& model, const C
     CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC);
     CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion);
 
-    std::vector<ov::element::Type> supported_activation_types;
-    std::vector<ov::element::Type> supported_compressed_weights_types;
-
-    bool useMatmulPrim = false;
-    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
-
-    if (useMatmulPrim) {
-        supported_activation_types = {Type_t::f32, Type_t::f16};
-        supported_compressed_weights_types = {Type_t::u8, Type_t::i8};
-    } else {
-        // @todo enable for bf16 as well
-        // after EnforceInferencePrecision is replaced with ConvertPrecision
-        supported_activation_types = {Type_t::f32};
-        supported_compressed_weights_types =
-            {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1};
-    }
-
     CPU_REGISTER_PASS_COMMON(
         manager,
         pass::ConvertFullyConnectedToFullyConnectedCompressed,
-        supported_activation_types,
-        supported_compressed_weights_types,
+        ov::intel_cpu::node::FullyConnected::getSupportedCompressedActivationsTypes(),
+        ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(),
         [&config](const std::shared_ptr<const ov::Node>& fc, size_t IC, size_t OC, size_t G) {
             return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation(fc,
                                                                                        IC,
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 6522d11e4703b6..13e890f6339e81 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -338,23 +338,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
     CPU_REGISTER_PASS_ARM(decompression_handling_manager, ov::pass::TransposeMatMul);
-    ov::element::TypeVector decompression_precisions{ov::element::u8,
-                                                     ov::element::i8,
-                                                     ov::element::u4,
-                                                     ov::element::i4,
-                                                     ov::element::nf4,
-                                                     ov::element::f4e2m1};
-
-    CPU_REGISTER_PASS_X64(decompression_handling_manager,
-                          ov::pass::MarkDequantization,
-                          decompression_precisions,
-                          false,
-                          true);
-    CPU_REGISTER_PASS_ARM(decompression_handling_manager,
-                          ov::pass::MarkDequantizationSubgraph,
-                          decompression_precisions,
-                          false,
-                          true);
+    const auto& decompression_precisions = ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes();
+    CPU_REGISTER_PASS_COMMON(decompression_handling_manager,
+                             ov::pass::MarkDequantization,
+                             decompression_precisions,
+                             false,
+                             true);
     CPU_SET_CALLBACK_COMMON(
         decompression_handling_manager,
         [&](const_node_ptr& node) -> bool {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
index 18eb284e623297..408dd40b4c658f 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
@@ -2,10 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#ifdef CPU_DEBUG_CAPS
-
 #include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp"
 
+#include "openvino/util/env_util.hpp"
+
 using namespace CPUTestUtils;
 
 namespace ov {
@@ -20,14 +20,22 @@ std::vector<ov::AnyMap> filter_additional_config_basic() {
 
 const std::vector<ov::element::Type> decompression_precisions = {ov::element::f32};
 const std::vector<ov::element::Type> weights_precisions = {ov::element::u8, ov::element::i8};
 
+bool should_use_decompression_impl() {
+#ifdef CPU_DEBUG_CAPS
+    return ov::util::getenv_bool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");
+#else
+    return false;
+#endif
+}
+
 const std::vector<MatMulDecompressionShapeParams> input_shapes = {
     {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}},
     {{{}, {{1, 8, 16}}}, {16, 32}, 4ul},
     {{{}, {{1, 4, 16}}}, {1, 16, 32}},
-    {{{}, {{5, 40, 496}}}, {1, 496, 240}},
+    {{{}, {{5, 40, 96}}}, {1, 96, 240}},
     {{{}, {{1, 4, 48}}}, {48, 256}},
-    {{{}, {{1, 11, 154}}}, {154, 77}, 154ul},
-    {{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {1, 480, 256}},
+    {{{}, {{1, 11, 104}}}, {104, 77}, 104ul},
+    {{{-1, -1, -1}, {{10, 40, 110}, {11, 40, 110}}}, {1, 110, 256}},
 };
 const std::vector<fusingSpecificParams> fusing_params{emptyFusingSpec, fusingBias};
 
@@ -42,10 +50,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights,
                                             ::testing::Values(false),
                                             ::testing::ValuesIn(filter_additional_config_basic()),
                                             ::testing::ValuesIn(fusing_params),
-                                            ::testing::Values(true)),
+                                            ::testing::Values(should_use_decompression_impl())),
                          MatmulWeightsDecompression::getTestCaseName);
 
-
 const std::vector<MatMulDecompressionShapeParams> input_shapes_corner_cases = {
     {{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}},
     {{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}},
@@ -54,8 +61,9 @@ const std::vector<MatMulDecompressionShapeParams> input_shapes_corner_cases = {
 };
 
 const std::vector<bool> transpose_weights = {true, false};
-const std::vector<DecompressionSubtractType> decompression_subtract_type = {
-    DecompressionSubtractType::full, DecompressionSubtractType::scalar, DecompressionSubtractType::empty};
+const std::vector<DecompressionSubtractType> decompression_subtract_type = {DecompressionSubtractType::full,
+                                                                            DecompressionSubtractType::scalar,
+                                                                            DecompressionSubtractType::empty};
 const std::vector<bool> reshape_on_decompression = {true, false};
 const std::vector<ov::element::Type> decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32};
 
@@ -70,12 +78,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases,
                                             ::testing::ValuesIn(reshape_on_decompression),
                                             ::testing::ValuesIn(filter_additional_config_basic()),
                                             ::testing::Values(emptyFusingSpec),
-                                            ::testing::Values(true)),
+                                            ::testing::Values(should_use_decompression_impl())),
                          MatmulWeightsDecompression::getTestCaseName);
 
 } // namespace
-
 } // namespace test
 } // namespace ov
-
-#endif
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
index 0719ab326aa9bf..e14245f2906e16 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
@@ -9,33 +9,6 @@ using namespace CPUTestUtils;
 
 namespace ov {
 namespace test {
 
-/*
- * WP - weights precision
- * DP - decompression precision
- * IP - input precision
- * SP - scale precision
- * Opt - optional
- *                 Subtract_const(WP)
- *                    /
- *    Weights(WP)   Convert(DP)
- *       |         /             Multiply_const(SP)
- *    Convert(DP)  Reshape (Opt)     /
- *          \      /            Convert(if SP != DP)
- *          Subtract(Opt)          /
- *                \           Reshape (Opt)
- *                 \            /
- *                  Multiply
- *                     |
- *      Reshape (in case of group decompression)
- *                     |
- *      Convert (if IP != DP)
- *                     |
- *   Data(IP)      Transpose(Opt)
- *        \           /
- *          Matmul
- *            |
- *           Bias
- */
 std::string MatmulWeightsDecompression::getTestCaseName(
     testing::TestParamInfo<MatmulWeightsDecompressionParams> obj) {
     MatMulDecompressionShapeParams shape_params;
@@ -144,7 +117,7 @@ void MatmulWeightsDecompression::SetUp() {
 
     if (configuration.count(ov::hint::inference_precision.name()) &&
         configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) {
-        abs_threshold = 0.1;
+        abs_threshold = 0.2;
     }
 
     ElementType netType = ov::element::f32;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
index b4dabfc8835d32..266aab8e445928 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
@@ -1,32 +1,56 @@
 // Copyright (C) 2018-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// #include "openvino/opsets/opset13.hpp"
-// #include "openvino/pass/manager.hpp"
-// #include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
+#include "shared_test_classes/subgraph/weights_decompression_builders.hpp"
 #include "utils/cpu_test_utils.hpp"
 #include "utils/fusing_test_utils.hpp"
-#include "common_test_utils/ov_tensor_utils.hpp"
-#include "shared_test_classes/subgraph/weights_decompression_builders.hpp"
 
 using namespace CPUTestUtils;
 
 namespace ov {
 namespace test {
+/*
+ * WP - weights precision
+ * DP - decompression precision
+ * IP - input precision
+ * SP - scale precision
+ * Opt - optional
+ *                 Subtract_const(WP)
+ *                    /
+ *    Weights(WP)   Convert(DP)
+ *       |         /             Multiply_const(SP)
+ *    Convert(DP)  Reshape (Opt)     /
+ *          \      /            Convert(if SP != DP)
+ *          Subtract(Opt)          /
+ *                \           Reshape (Opt)
+ *                 \            /
+ *                  Multiply
+ *                     |
+ *      Reshape (in case of group decompression)
+ *                     |
+ *      Convert (if IP != DP)
+ *                     |
+ *   Data(IP)      Transpose(Opt)
+ *        \           /
+ *          Matmul
+ *            |
+ *           Bias
+ */
 typedef std::tuple<MatMulDecompressionShapeParams,
-                   ov::test::ElementType,  // weights precision
-                   ov::test::ElementType,  // decompression precision
-                   ov::test::ElementType,  // scale precision
-                   bool,  // transpose on weights
-                   DecompressionSubtractType,  // decompression subtract type
-                   bool,  // reshape on decompression constants
-                   ov::AnyMap,  // additional config
-                   fusingSpecificParams,
-                   bool>  // should use decompression implementation
-MatmulWeightsDecompressionParams;
+                   ov::test::ElementType,      // weights precision
+                   ov::test::ElementType,      // decompression precision
+                   ov::test::ElementType,      // scale precision
+                   bool,                       // transpose on weights
+                   DecompressionSubtractType,  // decompression subtract type
+                   bool,                       // reshape on decompression constants
+                   ov::AnyMap,                 // additional config
+                   fusingSpecificParams,
+                   bool>                       // should use decompression implementation
+    MatmulWeightsDecompressionParams;
 
 class MatmulWeightsDecompression : public testing::WithParamInterface<MatmulWeightsDecompressionParams>,
                                    virtual public SubgraphBaseTest,
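
Reviewer note (illustration, not part of the patch): the fix replaces two hand-maintained
type lists with the FullyConnected static accessors, so the MarkDequantization registration
and the FC-compression conversion can no longer drift apart. Below is a minimal standalone
C++ sketch of that pattern; all names here (Type, env_bool, DEMO_USE_MATMUL_FOR_FC,
FullyConnectedSketch) are hypothetical stand-ins, not OpenVINO APIs.

#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for ov::element types; the real accessors return ov::element::TypeVector.
enum class Type { u8, i8, u4, i4, nf4, f4e2m1 };

// Stand-in for the debug-caps getEnvBool() check used in the patch.
static bool env_bool(const char* name) {
    const char* v = std::getenv(name);
    return v != nullptr && std::string(v) == "1";
}

struct FullyConnectedSketch {
    // Single source of truth: the dnnl matmul fallback handles only 8-bit
    // weights, while the default x86-64 FC path also supports 4-bit formats.
    static std::vector<Type> supportedCompressedWeightsTypes() {
        if (env_bool("DEMO_USE_MATMUL_FOR_FC")) {
            return {Type::u8, Type::i8};
        }
        return {Type::u8, Type::i8, Type::u4, Type::i4, Type::nf4, Type::f4e2m1};
    }
};

int main() {
    // Every registration site queries the same accessor, so the supported set
    // cannot diverge between passes -- the duplication this patch removes.
    const auto types = FullyConnectedSketch::supportedCompressedWeightsTypes();
    std::cout << "supported compressed weight types: " << types.size() << '\n';
    return 0;
}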