Commit

apply comments and more update
chenhu-wang committed Mar 18, 2024
1 parent 1dbc0fd commit 2e0c476
Showing 12 changed files with 102 additions and 44 deletions.
@@ -4,7 +4,6 @@

#pragma once

#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "snippets/pass/tokenization.hpp"

@@ -17,7 +16,7 @@ namespace pass {
* @brief Tokenize GroupNormalization to a subgraph
* @ingroup snippets
*/
-class TokenizeGNSnippets: public ov::pass::MatcherPass {
+class TokenizeGNSnippets : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("TokenizeGNSnippets", "0");
TokenizeGNSnippets();
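For orientation, a minimal sketch of how a matcher pass like this is typically driven (assuming the standard ov::pass::Manager flow; the wrapper function is illustrative, not part of this commit):

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "snippets/pass/gn_tokenization.hpp"

// Illustrative helper: run the tokenization pass over a model.
void tokenize_group_norm(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::snippets::pass::TokenizeGNSnippets>();
    manager.run_passes(model);
}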
6 changes: 3 additions & 3 deletions src/common/snippets/src/op/reshape.cpp
@@ -12,7 +12,7 @@ namespace ov {
namespace snippets {
namespace op {
Reshape::Reshape(const Output<Node>& arg, ov::PartialShape target_shape)
-    : Op({arg}), m_target_shape(target_shape) {
+    : Op({arg}), m_target_shape(std::move(target_shape)) {
constructor_validate_and_infer_types();
}

@@ -23,7 +23,7 @@ void Reshape::validate_and_infer_types() {
std::shared_ptr<Node> Reshape::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(Reshape);
check_new_args_count(this, new_args);
-    return std::make_shared<Reshape>(new_args.at(0), get_target_shape());
+    return std::make_shared<Reshape>(new_args.at(0), m_target_shape);
}

bool Reshape::visit_attributes(AttributeVisitor& visitor) {
@@ -36,7 +36,7 @@ const ov::PartialShape& Reshape::get_target_shape() const {
}

void Reshape::set_target_shape(ov::PartialShape shape) {
-    m_target_shape = shape;
+    m_target_shape = std::move(shape);
}
}// namespace op
}// namespace snippets
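The std::move changes above follow the standard sink-argument idiom: take the parameter by value, then move it into the member, so an rvalue argument costs a move instead of a copy. A minimal standalone illustration (generic types, not project code):

#include <utility>
#include <vector>

class Holder {
public:
    // By-value parameter + std::move: rvalues are moved in, lvalues copied once.
    explicit Holder(std::vector<int> v) : m_v(std::move(v)) {}
    void set(std::vector<int> v) { m_v = std::move(v); }

private:
    std::vector<int> m_v;
};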
13 changes: 11 additions & 2 deletions src/common/snippets/src/op/subgraph.cpp
@@ -57,6 +57,8 @@
#include <memory>
#include <array>

#include "snippets/lowered/pass/serialize_control_flow.hpp"

using namespace std;
using namespace ov::op::util;

@@ -321,7 +323,7 @@ VectorDims Subgraph::infer_master_shape() {
OPENVINO_ASSERT(!output_dims.empty(), "Can't calculate master_shape before the first shape inference");
} else {
for (const auto& res : body_ptr()->get_results()) {
-            auto reshape = ov::as_type_ptr<op::Reshape>(res->get_input_node_shared_ptr(0))
+            auto reshape = ov::as_type_ptr<op::Reshape>(res->get_input_node_shared_ptr(0));
auto res_input = reshape ? reshape->input(0) : res->input(0);
OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result has dynamic shape in static pipeline");
// We need to account to the shape's layout stored in Output<Node> rt_info
@@ -386,9 +388,10 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::data_flow_transformations")

ov::snippets::pass::Manager manager;
-    // GroupNormalizationDecomposition should be before canonicalization(rankNorm) as scale/bias shape is C and need special process.
+    // GNDecomposition should be before canonicalization(rankNorm) as scale/bias shape is C and need special process.
if (config.m_has_domain_sensitive_ops)
manager.register_pass<snippets::pass::GNDecomposition>();
+
if (!blocked_input_shapes.empty())
manager.register_pass<snippets::pass::Canonicalization>(blocked_input_shapes);
if (!input_precisions.empty() && !output_precisions.empty())
@@ -411,6 +414,12 @@

manager.register_positioned_passes(backend_passes);
manager.run_passes(body_ptr());
+
+    // ov::pass::Manager magr;
+    // std::string xmlo = "data_flow.xml";
+    // std::string bino = "data_flow.bin";
+    // magr.register_pass<ov::pass::Serialize>(xmlo, bino);
+    // magr.run_passes(body_ptr());
}

void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
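The commented-out block above is a debug hook; if it were enabled, the equivalent standalone helper would look roughly like this (a sketch assembled from the commented lines; ov::pass::Serialize dumps the model as IR xml/bin):

#include <memory>
#include <string>

#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/pass/serialize.hpp"

// Illustrative debug helper: serialize the subgraph body after data-flow transformations.
void dump_data_flow(const std::shared_ptr<ov::Model>& body) {
    ov::pass::Manager magr;
    magr.register_pass<ov::pass::Serialize>("data_flow.xml", "data_flow.bin");
    magr.run_passes(body);
}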
27 changes: 21 additions & 6 deletions src/common/snippets/src/pass/gn_decomposition.cpp
@@ -34,11 +34,11 @@ GNDecomposition::GNDecomposition() {
const auto num_groups = static_cast<size_t>(group_norm_node->get_num_groups());
const float eps = static_cast<float>(group_norm_node->get_epsilon());

-    ////////////collapse to reduce lastDim to avoid nested loop overhead(reduce tails process)///////////
+    ////////////collapse to reduce lastDim to avoid nested loop overhead(e.g. reduce tails in inner loop)///////////
// reshape [N, C, spatial] to [N, group, 1, (C / group) * spatial]
const auto orig_shape = group_norm_node->get_input_partial_shape(0);
size_t orig_rank = orig_shape.rank().get_length();
-    size_t GNDecomposition = 4;
+    size_t group_rank = 4;
std::vector<Dimension> group_dims(group_rank);
group_dims[0] = orig_shape[0];
group_dims[1] = Dimension(num_groups);
@@ -50,12 +50,17 @@
}
group_dims[3] = group_dims[3] * spatial_dim;
ov::PartialShape group_shape(group_dims);
-    std::shared_ptr<ov::op::Op> reshaped_node = std::make_shared<ov::snippets::op::Reshape>(data, group_shape);
+    std::shared_ptr<ov::op::Op> reshaped_node_orig = std::make_shared<ov::snippets::op::Reshape>(data, group_shape);
+
+    std::shared_ptr<ov::op::Op> reshaped_node1 = reshaped_node_orig;
+    if (data.get_element_type() != element::f32) {
+        reshaped_node1 = std::make_shared<ov::snippets::op::ConvertSaturation>(reshaped_node_orig, element::f32);
+    }

// reduceSum on dimension [C / group * spatial]
std::vector<int64_t> axis(1, 3);
auto axis_node = std::make_shared<ov::op::v0::Constant>(element::i64, Shape{axis.size()}, axis);
-    const auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>(reshaped_node, axis_node, true);
+    const auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>(reshaped_node1, axis_node, true);

// reduceMean
auto group_shape_static = group_shape.to_shape();
@@ -64,7 +69,11 @@
const auto reduce_mean = std::make_shared<ov::op::v1::Multiply>(reduce_sum, group_size_inv_node);

// x - mean
-    auto sub_mean = std::make_shared<ov::op::v1::Subtract>(reshaped_node, reduce_mean);
+    std::shared_ptr<ov::op::Op> reshaped_node2 = reshaped_node_orig;
+    if (data.get_element_type() != element::f32) {
+        reshaped_node2 = std::make_shared<ov::snippets::op::ConvertSaturation>(reshaped_node_orig, element::f32);
+    }
+    auto sub_mean = std::make_shared<ov::op::v1::Subtract>(reshaped_node2, reduce_mean);
// (x - mean) ^ 2
auto sqr_const = std::make_shared<ov::op::v0::Constant>(element::f32, Shape{1}, std::vector<int64_t>{2});
auto sqr = std::make_shared<ov::op::v1::Power>(sub_mean, sqr_const);
@@ -110,8 +119,14 @@
auto scaled_node = std::make_shared<ov::op::v1::Multiply>(mvn_reshaped, reshape_scale);
auto biased_node = std::make_shared<ov::op::v1::Add>(scaled_node, reshape_bias);

+    auto result_prec = group_norm_node->get_output_element_type(0);
+    std::shared_ptr<ov::op::Op> biased_node_convert = biased_node;
+    if (result_prec != element::f32) {
+        biased_node_convert = std::make_shared<ov::snippets::op::ConvertSaturation>(biased_node, data.get_element_type());
+    }
+
    // reshape_back [N, group, C / group, spatial] to [N, C, spatial]
-    const auto reshape_back_node = std::make_shared<ov::snippets::op::Reshape>(biased_node, orig_shape);
+    const auto reshape_back_node = std::make_shared<ov::snippets::op::Reshape>(biased_node_convert, orig_shape);

std::vector<size_t> subtensor(group_rank, 1);
subtensor[3] = PortDescriptor::ServiceDimensions::FULL_DIM;
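To make the decomposition concrete, here is a scalar reference of the same math (an illustration only, not the Snippets kernel), assuming an f32 input laid out as [N, C, S] with S the collapsed spatial size. For example, with N=3, C=8, S=16 and num_groups=4, the collapse reshape above produces [3, 4, 1, 32], and each group averages over (C/G)*S = 32 elements:

#include <cmath>
#include <cstddef>
#include <vector>

// Reference GroupNormalization: reshape [N, C, S] to [N, G, (C/G)*S], compute
// mean/variance per group, normalize, then apply per-channel scale and bias.
std::vector<float> group_norm_ref(const std::vector<float>& x,
                                  std::size_t N, std::size_t C, std::size_t S,
                                  std::size_t G, float eps,
                                  const std::vector<float>& scale,   // size C
                                  const std::vector<float>& bias) {  // size C
    std::vector<float> y(x.size());
    const std::size_t group_size = (C / G) * S;  // elements reduced per group
    for (std::size_t n = 0; n < N; ++n) {
        for (std::size_t g = 0; g < G; ++g) {
            const std::size_t base = (n * G + g) * group_size;
            // mean = reduce_sum * (1 / group_size), as in the ReduceSum + Multiply above
            float sum = 0.f;
            for (std::size_t i = 0; i < group_size; ++i) sum += x[base + i];
            const float mean = sum / group_size;
            // variance = mean((x - mean)^2), as in the Subtract + Power path above
            float sq_sum = 0.f;
            for (std::size_t i = 0; i < group_size; ++i) {
                const float d = x[base + i] - mean;
                sq_sum += d * d;
            }
            const float inv_std = 1.f / std::sqrt(sq_sum / group_size + eps);
            // normalize, then per-channel scale/bias (scale/bias shape is C)
            for (std::size_t i = 0; i < group_size; ++i) {
                const std::size_t c = g * (C / G) + i / S;  // original channel index
                y[base + i] = (x[base + i] - mean) * inv_std * scale[c] + bias[c];
            }
        }
    }
    return y;
}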
4 changes: 3 additions & 1 deletion src/common/snippets/src/pass/gn_tokenization.cpp
@@ -3,6 +3,7 @@
//

#include "snippets/pass/gn_tokenization.hpp"
#include "snippets/pass/collapse_subgraph.hpp"

#include "snippets/itt.hpp"
#include "snippets/op/subgraph.hpp"
@@ -19,7 +20,8 @@ ov::snippets::pass::TokenizeGNSnippets::TokenizeGNSnippets() {
ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::TokenizeGNSnippets")
auto group_norm_node = ov::as_type_ptr<ov::op::v12::GroupNormalization>(m.get_match_root());
-        if (group_norm_node->is_dynamic())
+        if (group_norm_node->is_dynamic() ||
+            TokenizeSnippets::get_supported_element_types().count(group_norm_node->get_element_type()) == 0)
return false;

auto subgraph = op::Subgraph::wrap_node_as_subgraph(group_norm_node);
@@ -252,9 +252,15 @@ ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr<Node>& n) {
}

Result ReshapeShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes passed ReshapeShapeInfer");
OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReshapeShapeInfer");
OPENVINO_ASSERT(target_shape.is_static(), "target_shape should be static in ReshapeShapeInfer");
VectorDims result_shape = target_shape.get_shape();
-    // todo: check static and size is the same
+    const auto input_elems =
+        std::accumulate(input_shapes[0].get().begin(), input_shapes[0].get().end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
+    const auto output_elems =
+        std::accumulate(result_shape.begin(), result_shape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
+    OPENVINO_ASSERT(input_elems == output_elems, "Tensor volume should be the same after reshape in ReshapeShapeInfer");
+
return {{result_shape}, ShapeInferStatus::success};
}

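The new assert simply compares element counts before and after the reshape; for instance, reshaping [3, 8, 16] to [3, 4, 1, 32] passes because both sides hold 384 elements. The check in isolation (a restatement of the lines above, with generic shape vectors):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

bool volumes_match(const std::vector<int64_t>& in, const std::vector<int64_t>& out) {
    const auto volume = [](const std::vector<int64_t>& s) {
        return std::accumulate(s.begin(), s.end(), static_cast<int64_t>(1), std::multiplies<int64_t>());
    };
    return volume(in) == volume(out);  // a reshape must preserve the tensor volume
}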
3 changes: 1 addition & 2 deletions src/common/snippets/src/shape_inference/shape_inference.cpp
@@ -60,6 +60,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry
SHAPE_INFER_PREDEFINED(op::KernelStatic, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer),
SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer),
+    SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer),
@@ -89,8 +90,6 @@ std::shared_ptr<IShapeInferSnippets> make_shape_inference(const std::shared_ptr<
ov::is_type<ov::op::util::BinaryElementwiseComparison>(op) ||
ov::is_type<ov::op::util::BinaryElementwiseLogical>(op)) {
return std::make_shared<NumpyBroadcastShapeInfer>();
-    } else if (ov::is_type<ov::snippets::op::Reshape>(op)) {
-        return std::make_shared<ReshapeShapeInfer>(op);
} else {
OPENVINO_THROW("Operation type " + std::string(op->get_type_info().name) + " is not supported in Snippets shape inference pipeline");
}
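The net effect of this change is that Reshape moves from a hard-coded else-if branch in make_shape_inference into the factory registry, so lookup becomes table-driven like the other ops. A toy model of that pattern (hypothetical types; the real TRegistry keys on the op's type_info rather than a string):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct IShapeInfer { virtual ~IShapeInfer() = default; };
struct ToyReshapeInfer : IShapeInfer {};

using Maker = std::function<std::shared_ptr<IShapeInfer>()>;

// Table-driven dispatch: adding an op means adding one registry entry,
// not another branch in a type-switch.
const std::map<std::string, Maker>& registry() {
    static const std::map<std::string, Maker> r = {
        {"Reshape", [] { return std::make_shared<ToyReshapeInfer>(); }},
    };
    return r;
}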
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -641,6 +641,12 @@ void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp)
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::intel_cpu::pass::FuseLoadStoreConvert,
ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape);

+    // ov::pass::Manager magr;
+    // std::string xmlo = "original.xml";
+    // std::string bino = "original.bin";
+    // magr.register_pass<ov::pass::Serialize>(xmlo, bino);
+    // magr.run_passes(snippetAttrs.snippet->body_ptr());
+
schedule = snippetAttrs.snippet->generate_from_linear_ir(std::make_shared<ov::snippets::lowered::pass::PassConfig>(),
backend_passes,
reinterpret_cast<const void*>(jcp));
@@ -67,7 +67,7 @@
#include "transformations/op_conversions/hswish_decomposition.hpp"
#include "transformations/op_conversions/gru_cell_decomposition.hpp"
#include "transformations/op_conversions/lstm_cell_decomposition.hpp"
#include "transformations/op_conversions/gn_decomposition.hpp"
#include "transformations/op_conversions/group_normalization_decomposition.hpp"
#include "transformations/op_conversions/mvn6_decomposition.hpp"
#include "transformations/op_conversions/normalize_l2_decomposition.hpp"
#include "transformations/op_conversions/reduce_l1_decomposition.hpp"
@@ -471,11 +471,14 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
},
ov::pass::NormalizeL2Decomposition);

-    // todo: only support f32 in first version
CPU_SET_CALLBACK_X64(manager,
[](const_node_ptr &node) -> bool {
-            return !node->is_dynamic() && node->get_output_element_type(0) == element::f32;
+            return !node->is_dynamic() &&
+                   ov::snippets::pass::TokenizeSnippets::get_supported_element_types().count(node->get_element_type()) != 0;
},
-        ov::pass::GNDecomposition);
+        ov::pass::GroupNormalizationDecomposition);

CPU_ENABLE_PASS_COMMON(manager, ov::pass::SoftmaxDecomposition);
CPU_SET_CALLBACK_COMMON(manager,
[](const_node_ptr &node) -> bool {
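A hedged reading of this callback, assuming the convention used by other op_conversions passes (a transformation callback returning true makes the pass skip that node): static GroupNormalization with a Snippets-supported element type is left intact for tokenization, everything else falls back to the generic decomposition. Restated as a standalone predicate:

#include <memory>

#include "openvino/core/node.hpp"
#include "snippets/pass/collapse_subgraph.hpp"

// true  -> GroupNormalizationDecomposition is skipped; Snippets tokenizes the op later;
// false -> the generic decomposition lowers the op to elementary operations.
bool keep_group_norm_for_snippets(const std::shared_ptr<const ov::Node>& node) {
    return !node->is_dynamic() &&
           ov::snippets::pass::TokenizeSnippets::get_supported_element_types().count(
               node->get_element_type()) != 0;
}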
@@ -1,33 +1,37 @@
-// Copyright (C) 2023 Intel Corporation
+// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
-#include "single_layer_tests/group_normalization.hpp"
-
-using namespace ov::test::subgraph;
+#include "single_op_tests/group_normalization.hpp"

namespace {
+using ov::test::GroupNormalizationTest;

const std::vector<ov::test::ElementType> netPrecisions = {
    ov::element::f32,
-    ov::element::bf16,
-    ov::element::i8
+    // ov::element::bf16, // remove specific merge convert
+    // ov::element::i8 // ref impl does not support int8 precision
};

+// static shapes
+const std::vector<ov::Shape> staticInputShapes = {
+    {3, 8, 3},
+    {3, 8, 8},
+    {3, 8, 16},
+    {3, 8, 21},
+    {1, 4, 8, 8},
+    {1, 8, 1, 22},
+    {3, 16, 1, 33},
+    {1, 4, 1, 1, 34},
+    {1, 8, 1, 8, 2, 2},
+    {1, 8, 1, 8, 2, 2, 2},
+};

-const std::vector<ov::test::InputShape> inputShapes = {
-    // static shape
-    {{1, 4, 1, 8}, {{1, 4, 1, 8}}},
-    {{3, 8, 2, 32}, {{3, 8, 2, 32}}},
-    {{3, 8, 16, 8, 4}, {{3, 8, 16, 8, 4}}},
-    {{3, 8, 16, 8, 64}, {{3, 8, 16, 8, 64}}},
-    {{3, 8, 16, 100, 4}, {{3, 8, 16, 100, 4}}},
-    {{3, 16, 16, 8, 4}, {{3, 16, 16, 8, 4}}},
-    {{1, 8, 8}, {{1, 8, 8}}},
-    {{1, 8, 1, 8, 2}, {{1, 8, 1, 8, 2}}},
-    {{1, 8, 1, 8, 2, 2}, {{1, 8, 1, 8, 2, 2}}},
-    {{1, 8, 1, 8, 2, 2, 2}, {{1, 8, 1, 8, 2, 2, 2}}},
-    // dynmaic shape
+// dynamic shapes
+const std::vector<ov::test::InputShape> DynamicInputShapes = {
    {{-1, -1, -1}, {{1, 8, 22}, {2, 4, 7}, {1, 8, 22}}},
    {{-1, -1, -1, -1}, {{1, 16, 8, 8}, {2, 8, 4, 4}, {1, 16, 8, 8}}},
-    {{{1, 4}, 16, -1, -1}, {{1, 16, 6, 6}, {4, 16, 10, 10}, {1, 16, 6, 6}}}
+    {{{1, 4}, {4, 16}, -1, -1}, {{1, 4, 6, 6}, {4, 16, 10, 10}, {1, 4, 6, 6}}},
+    {{-1, -1, -1, -1, -1}, {{1, 16, 7, 7, 1}, {2, 8, 4, 4, 1}, {1, 16, 7, 7, 1}}},
};

const std::vector<int64_t> numGroups = {
@@ -39,12 +43,25 @@ const std::vector<double> epsilon = {
};

INSTANTIATE_TEST_SUITE_P(
-    smoke_GroupNormalization,
+    smoke_GroupNormalizationStatic,
    GroupNormalizationTest,
    testing::Combine(testing::ValuesIn(netPrecisions),
                     ::testing::Values(ov::element::undefined),
                     ::testing::Values(ov::element::undefined),
+                    testing::ValuesIn(ov::test::static_shapes_to_test_representation(staticInputShapes)),
+                    testing::ValuesIn(numGroups),
+                    testing::ValuesIn(epsilon),
+                    testing::Values(ov::test::utils::DEVICE_CPU),
+                    testing::Values(ov::AnyMap())),
+    GroupNormalizationTest::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(
+    smoke_GroupNormalizationDyn,
+    GroupNormalizationTest,
+    testing::Combine(testing::ValuesIn(netPrecisions),
+                     ::testing::Values(ov::element::undefined),
+                     ::testing::Values(ov::element::undefined),
-                    testing::ValuesIn(inputShapes),
+                    testing::ValuesIn(DynamicInputShapes),
                     testing::ValuesIn(numGroups),
                     testing::ValuesIn(epsilon),
                     testing::Values(ov::test::utils::DEVICE_CPU),
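The static suite feeds plain ov::Shape values through ov::test::static_shapes_to_test_representation, which packs each one into the bound-shape/target-shapes pair form the dynamic suite uses directly; in spirit it does something like the following (a hedged sketch with a stand-in pair type, not the actual utility):

#include <utility>
#include <vector>

#include "openvino/core/partial_shape.hpp"
#include "openvino/core/shape.hpp"

// Stand-in for InputShape: a (possibly dynamic) bound shape plus concrete target shapes.
using ToyInputShape = std::pair<ov::PartialShape, std::vector<ov::Shape>>;

std::vector<ToyInputShape> to_test_representation(const std::vector<ov::Shape>& shapes) {
    std::vector<ToyInputShape> result;
    for (const auto& s : shapes)
        result.push_back({ov::PartialShape(s), {s}});  // static bound, single target shape
    return result;
}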
@@ -292,6 +292,8 @@ std::vector<std::string> disabledTestPatterns() {
retVector.emplace_back(R"(.*Extension.OnnxModelWithExtensionFromDSO.*)");
retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.MaxPool.*)");
retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.Convolution.*)");
+        // Ticket: 134601
+        retVector.emplace_back(R"(.*smoke_GroupNormalization.*)");
}
// invalid test: checks u8 precision for runtime graph, while it should be f32
retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)");
@@ -81,7 +81,7 @@ class GroupNormalizationTest : public testing::WithParamInterface<GroupNormaliza
std::transform(shape.second.cbegin(), shape.second.cend(), std::back_inserter(biasShape),
[](const ov::Shape& s)->ov::Shape { return {s[1]}; });
InputShape biasInputShape {
-            ov::PartialShape{shape.first[1]},
+            shape.first.is_dynamic() ? ov::PartialShape{shape.first[1]} : shape.first,
std::move(biasShape)
};
return biasInputShape;
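For context, the lambda in this hunk derives the scale/bias target shapes from the data shapes: for a data shape [N, C, ...] the scale and bias are 1-D tensors of shape [C]. In isolation (an illustrative restatement, not the test class itself):

#include <vector>

#include "openvino/core/shape.hpp"

// For each data shape [N, C, ...], GroupNormalization's scale/bias shape is {C}.
std::vector<ov::Shape> make_bias_shapes(const std::vector<ov::Shape>& data_shapes) {
    std::vector<ov::Shape> out;
    out.reserve(data_shapes.size());
    for (const auto& s : data_shapes)
        out.push_back(ov::Shape{s[1]});  // channel count
    return out;
}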
