From 51b67cac8980e4c8f9cf60a357547563e774151a Mon Sep 17 00:00:00 2001
From: dmitrygo
Date: Wed, 18 Dec 2024 16:26:12 +0400
Subject: [PATCH] review fixes

---
 .../executors/dnnl/dnnl_matmul_primitive.cpp  |  2 +-
 .../intel_cpu/src/nodes/fullyconnected.cpp    | 36 +++++++++++++
 .../intel_cpu/src/nodes/fullyconnected.h      |  2 +
 .../convert_to_cpu_specific_opset.hpp         | 23 +-------
 .../transformation_pipeline.cpp               | 23 +++-----
 .../src/arm/matmul_weights_decompression.cpp  | 31 ++++++-----
 .../classes/matmul_weights_decompression.cpp  | 29 +---------
 .../classes/matmul_weights_decompression.hpp  | 54 +++++++++++++------
 8 files changed, 105 insertions(+), 95 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
index c1e768d8120d88..9ffe4731689d43 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp
@@ -155,7 +155,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs,
     const auto maxRank =
         std::max({srcDesc->getShape().getRank(), weiDesc->getShape().getRank(), dstDesc->getShape().getRank()});
-    auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank);
+    const auto normWeiDims = normalizeToRank(weiDesc->getShape().getStaticDims(), maxRank);
     if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) {
         auto dstPrc = ov::element::f32;
         dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES),
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index eb1efe3d69190d..4a2e3728887087 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -41,6 +41,42 @@ namespace ov {
 namespace intel_cpu {
 namespace node {
 
+ov::element::TypeVector FullyConnected::getSupportedCompressedWeightsTypes() {
+    using ov::element::Type_t;
+
+    bool useMatmulPrim = false;
+    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
+
+    if (useMatmulPrim) {
+        return {Type_t::u8, Type_t::i8};
+    } else {
+#if defined(OPENVINO_ARCH_X86_64)
+        return {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1};
+#else
+        return {};
+#endif
+    }
+}
+
+ov::element::TypeVector FullyConnected::getSupportedCompressedActivationsTypes() {
+    using ov::element::Type_t;
+
+    bool useMatmulPrim = false;
+    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
+
+    if (useMatmulPrim) {
+        return {Type_t::f32, Type_t::f16};
+    } else {
+#if defined(OPENVINO_ARCH_X86_64)
+        // @todo enable for bf16 as well
+        // after EnforceInferencePrecision is replaced with ConvertPrecision
+        return {Type_t::f32};
+#else
+        return {};
+#endif
+    }
+}
+
 bool FullyConnected::isSupportedOperation(const std::shared_ptr<const ov::Node>& op,
                                           std::string& errorMessage) noexcept {
     try {
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 0b50d882c9e554..660e420d5c58cd 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -72,6 +72,8 @@ class FullyConnected : public Node {
                                               size_t OC,
                                               size_t G,
                                               ov::element::Type inferencePrecision) noexcept;
+    static ov::element::TypeVector getSupportedCompressedWeightsTypes();
+    static ov::element::TypeVector getSupportedCompressedActivationsTypes();
 
     bool isExecutable() const override {
         return !isInputTensorAtPortEmpty(0);
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
index 5e3062cc656853..614f7d690f8726 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
@@ -28,8 +28,6 @@ namespace ov {
 namespace intel_cpu {
 
 inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model>& model, const Config& config) {
-    using ov::element::Type_t;
-
     RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
 
     ov::pass::Manager manager("CPU:ConvertToCPUSpecificOpset");
@@ -38,28 +36,11 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model>& model, const C
     CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC);
     CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion);
 
-    std::vector<ov::element::Type> supported_activation_types;
-    std::vector<ov::element::Type> supported_compressed_weights_types;
-
-    bool useMatmulPrim = false;
-    CPU_DEBUG_CAP_ENABLE(useMatmulPrim = getEnvBool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");)
-
-    if (useMatmulPrim) {
-        supported_activation_types = {Type_t::f32, Type_t::f16};
-        supported_compressed_weights_types = {Type_t::u8, Type_t::i8};
-    } else {
-        // @todo enable for bf16 as well
-        // after EnforceInferencePrecision is replaced with ConvertPrecision
-        supported_activation_types = {Type_t::f32};
-        supported_compressed_weights_types =
-            {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4, Type_t::nf4, Type_t::f4e2m1};
-    }
-
     CPU_REGISTER_PASS_COMMON(
         manager,
         pass::ConvertFullyConnectedToFullyConnectedCompressed,
-        supported_activation_types,
-        supported_compressed_weights_types,
+        ov::intel_cpu::node::FullyConnected::getSupportedCompressedActivationsTypes(),
+        ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes(),
         [&config](const std::shared_ptr<const ov::Node>& fc, size_t IC, size_t OC, size_t G) {
             return ov::intel_cpu::node::FullyConnected::isSupportedCompressedOperation(fc,
                                                                                        IC,
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 6522d11e4703b6..13e890f6339e81 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -338,23 +338,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
     CPU_REGISTER_PASS_ARM(decompression_handling_manager, ov::pass::TransposeMatMul);
-    ov::element::TypeVector decompression_precisions{ov::element::u8,
-                                                     ov::element::i8,
-                                                     ov::element::u4,
-                                                     ov::element::i4,
-                                                     ov::element::nf4,
-                                                     ov::element::f4e2m1};
-
-    CPU_REGISTER_PASS_X64(decompression_handling_manager,
-                          ov::pass::MarkDequantization,
-                          decompression_precisions,
-                          false,
-                          true);
-    CPU_REGISTER_PASS_ARM(decompression_handling_manager,
-                          ov::pass::MarkDequantizationSubgraph,
-                          decompression_precisions,
-                          false,
-                          true);
+    const auto& decompression_precisions = ov::intel_cpu::node::FullyConnected::getSupportedCompressedWeightsTypes();
+    CPU_REGISTER_PASS_COMMON(decompression_handling_manager,
+                             ov::pass::MarkDequantization,
+                             decompression_precisions,
+                             false,
+                             true);
     CPU_SET_CALLBACK_COMMON(
         decompression_handling_manager,
         [&](const_node_ptr& node) -> bool {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
index 18eb284e623297..408dd40b4c658f 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/arm/matmul_weights_decompression.cpp
@@ -2,10 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#ifdef CPU_DEBUG_CAPS
-
 #include "custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp"
 
+#include "openvino/util/env_util.hpp"
+
 using namespace CPUTestUtils;
 
 namespace ov {
@@ -20,14 +20,22 @@ std::vector<ov::AnyMap> filter_additional_config_basic() {
 
 const std::vector<ov::element::Type> decompression_precisions = {ov::element::f32};
 const std::vector<ov::element::Type> weights_precisions = {ov::element::u8, ov::element::i8};
 
+bool should_use_decompression_impl() {
+#ifdef CPU_DEBUG_CAPS
+    return ov::util::getenv_bool("OV_CPU_ENABLE_DNNL_MAMTUL_FOR_FC");
+#else
+    return false;
+#endif
+}
+
 const std::vector<MatMulDecompressionShapeParams> input_shapes = {
     {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}},
     {{{}, {{1, 8, 16}}}, {16, 32}, 4ul},
     {{{}, {{1, 4, 16}}}, {1, 16, 32}},
-    {{{}, {{5, 40, 496}}}, {1, 496, 240}},
+    {{{}, {{5, 40, 96}}}, {1, 96, 240}},
     {{{}, {{1, 4, 48}}}, {48, 256}},
-    {{{}, {{1, 11, 154}}}, {154, 77}, 154ul},
-    {{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {1, 480, 256}},
+    {{{}, {{1, 11, 104}}}, {104, 77}, 104ul},
+    {{{-1, -1, -1}, {{10, 40, 110}, {11, 40, 110}}}, {1, 110, 256}},
 };
 const std::vector<fusingSpecificParams> fusing_params{emptyFusingSpec, fusingBias};
 
@@ -42,10 +50,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights,
                                             ::testing::Values(false),
                                             ::testing::ValuesIn(filter_additional_config_basic()),
                                             ::testing::ValuesIn(fusing_params),
-                                            ::testing::Values(true)),
+                                            ::testing::Values(should_use_decompression_impl())),
                          MatmulWeightsDecompression::getTestCaseName);
 
-
 const std::vector<MatMulDecompressionShapeParams> input_shapes_corner_cases = {
     {{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}},
     {{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}},
@@ -54,8 +61,9 @@ const std::vector<MatMulDecompressionShapeParams> input_shapes_corner_cases = {
 };
 
 const std::vector<bool> transpose_weights = {true, false};
-const std::vector<DecompressionSubtractType> decompression_subtract_type = {
-    DecompressionSubtractType::full, DecompressionSubtractType::scalar, DecompressionSubtractType::empty};
+const std::vector<DecompressionSubtractType> decompression_subtract_type = {DecompressionSubtractType::full,
+                                                                            DecompressionSubtractType::scalar,
+                                                                            DecompressionSubtractType::empty};
 const std::vector<bool> reshape_on_decompression = {true, false};
 const std::vector<ov::element::Type> decompression_precisions_corner_cases = {ov::element::f16, ov::element::f32};
 
@@ -70,12 +78,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases,
                                             ::testing::ValuesIn(reshape_on_decompression),
                                             ::testing::ValuesIn(filter_additional_config_basic()),
                                             ::testing::Values(emptyFusingSpec),
-                                            ::testing::Values(true)),
+                                            ::testing::Values(should_use_decompression_impl())),
                          MatmulWeightsDecompression::getTestCaseName);
 
 } // namespace
-
 } // namespace test
 } // namespace ov
-
-#endif
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
index 0719ab326aa9bf..e14245f2906e16 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.cpp
@@ -9,33 +9,6 @@ using namespace CPUTestUtils;
 
 namespace ov {
 namespace test {
 
-/*
- * WP - weights precision
- * DP - decompression precision
- * IP - input precision
- * SP - scale precision
- * Opt - optional
- *                 Subtract_const(WP)
- *                    /
- *    Weights(WP)   Convert(DP)
- *       |         /             Multiply_const(SP)
- *    Convert(DP)  Reshape (Opt)     /
- *          \      /            Convert(if SP != DP)
- *          Subtract(Opt)          /
- *                \           Reshape (Opt)
- *                 \            /
- *                  Multiply
- *                     |
- *      Reshape (in case of group decompression)
- *                     |
- *      Convert (if IP != DP)
- *                     |
- *   Data(IP)      Transpose(Opt)
- *        \           /
- *          Matmul
- *            |
- *           Bias
- */
 std::string MatmulWeightsDecompression::getTestCaseName(
     testing::TestParamInfo<MatmulWeightsDecompressionParams> obj) {
     MatMulDecompressionShapeParams shape_params;
@@ -144,7 +117,7 @@ void MatmulWeightsDecompression::SetUp() {
 
     if (configuration.count(ov::hint::inference_precision.name()) &&
         configuration.at(ov::hint::inference_precision.name()) == ov::element::f16) {
-        abs_threshold = 0.1;
+        abs_threshold = 0.2;
     }
 
     ElementType netType = ov::element::f32;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
index b4dabfc8835d32..266aab8e445928 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/matmul_weights_decompression.hpp
@@ -1,32 +1,56 @@
 // Copyright (C) 2018-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-// #include "openvino/opsets/opset13.hpp"
-// #include "openvino/pass/manager.hpp"
-// #include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
+#include "shared_test_classes/subgraph/weights_decompression_builders.hpp"
 #include "utils/cpu_test_utils.hpp"
 #include "utils/fusing_test_utils.hpp"
-#include "common_test_utils/ov_tensor_utils.hpp"
-#include "shared_test_classes/subgraph/weights_decompression_builders.hpp"
 
 using namespace CPUTestUtils;
 
 namespace ov {
 namespace test {
+/*
+ * WP - weights precision
+ * DP - decompression precision
+ * IP - input precision
+ * SP - scale precision
+ * Opt - optional
+ *                 Subtract_const(WP)
+ *                    /
+ *    Weights(WP)   Convert(DP)
+ *       |         /             Multiply_const(SP)
+ *    Convert(DP)  Reshape (Opt)     /
+ *          \      /            Convert(if SP != DP)
+ *          Subtract(Opt)          /
+ *                \           Reshape (Opt)
+ *                 \            /
+ *                  Multiply
+ *                     |
+ *      Reshape (in case of group decompression)
+ *                     |
+ *      Convert (if IP != DP)
+ *                     |
+ *   Data(IP)      Transpose(Opt)
+ *        \           /
+ *          Matmul
+ *            |
+ *           Bias
+ */
 typedef std::tuple<MatMulDecompressionShapeParams,
-                   ov::test::ElementType,  // weights precision
-                   ov::test::ElementType,  // decompression precision
-                   ov::test::ElementType,  // scale precision
-                   bool,  // transpose on weights
-                   DecompressionSubtractType,  // decompression subtract type
-                   bool,  // reshape on decompression constants
-                   ov::AnyMap,  // additional config
-                   fusingSpecificParams,
-                   bool>  // should use decompression implementation
-MatmulWeightsDecompressionParams;
+                   ov::test::ElementType,      // weights precision
+                   ov::test::ElementType,      // decompression precision
+                   ov::test::ElementType,      // scale precision
+                   bool,                       // transpose on weights
+                   DecompressionSubtractType,  // decompression subtract type
+                   bool,                       // reshape on decompression constants
+                   ov::AnyMap,                 // additional config
+                   fusingSpecificParams,
+                   bool>                       // should use decompression implementation
+    MatmulWeightsDecompressionParams;
 
 class MatmulWeightsDecompression : public testing::WithParamInterface<MatmulWeightsDecompressionParams>,
                                    virtual public SubgraphBaseTest,
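
Reviewer note (illustration, not part of the patch): the fix replaces two hand-maintained
type lists with the FullyConnected static accessors, so the MarkDequantization registration
and the FC-compression conversion can no longer drift apart. Below is a minimal standalone
C++ sketch of that pattern; all names here (Type, env_bool, DEMO_USE_MATMUL_FOR_FC,
FullyConnectedSketch) are hypothetical stand-ins, not OpenVINO APIs.

#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for ov::element types; the real accessors return ov::element::TypeVector.
enum class Type { u8, i8, u4, i4, nf4, f4e2m1 };

// Stand-in for the debug-caps getEnvBool() check used in the patch.
static bool env_bool(const char* name) {
    const char* v = std::getenv(name);
    return v != nullptr && std::string(v) == "1";
}

struct FullyConnectedSketch {
    // Single source of truth: the dnnl matmul fallback handles only 8-bit
    // weights, while the default x86-64 FC path also supports 4-bit formats.
    static std::vector<Type> supportedCompressedWeightsTypes() {
        if (env_bool("DEMO_USE_MATMUL_FOR_FC")) {
            return {Type::u8, Type::i8};
        }
        return {Type::u8, Type::i8, Type::u4, Type::i4, Type::nf4, Type::f4e2m1};
    }
};

int main() {
    // Every registration site queries the same accessor, so the supported set
    // cannot diverge between passes -- the duplication this patch removes.
    const auto types = FullyConnectedSketch::supportedCompressedWeightsTypes();
    std::cout << "supported compressed weight types: " << types.size() << '\n';
    return 0;
}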