From 1dbc0fdaa5e3d8fb853d692855fb1b7c78787e00 Mon Sep 17 00:00:00 2001 From: chenhuwa Date: Tue, 19 Dec 2023 15:18:18 +0800 Subject: [PATCH] groupNorm tokenzation decompostion and sheduling --- .../snippets/include/snippets/op/reshape.hpp | 38 +++++ .../snippets/pass/gn_decomposition.hpp | 27 ++++ .../include/snippets/pass/gn_tokenization.hpp | 28 ++++ .../shape_inference/shape_infer_instances.hpp | 7 + .../include/snippets/snippets_isa.hpp | 1 + .../include/snippets/snippets_isa_tbl.hpp | 1 + src/common/snippets/src/generator.cpp | 1 + src/common/snippets/src/lowered/linear_ir.cpp | 3 + .../src/lowered/pass/allocate_buffers.cpp | 4 +- .../src/lowered/pass/assign_registers.cpp | 29 +++- .../src/lowered/pass/insert_buffers.cpp | 32 ++++- .../src/lowered/pass/insert_load_store.cpp | 41 ++++-- .../snippets/src/lowered/pass/mark_loops.cpp | 3 +- src/common/snippets/src/op/reshape.cpp | 43 ++++++ src/common/snippets/src/op/subgraph.cpp | 13 +- .../snippets/src/pass/align_element_types.cpp | 3 +- .../snippets/src/pass/gn_decomposition.cpp | 132 ++++++++++++++++++ .../snippets/src/pass/gn_tokenization.cpp | 37 +++++ src/common/snippets/src/pass/tokenization.cpp | 2 + .../shape_inference/shape_infer_instances.cpp | 14 ++ .../src/shape_inference/shape_inference.cpp | 2 + .../emitters/snippets/x64/cpu_generator.cpp | 1 + .../snippets/x64/jit_kernel_emitter.cpp | 9 +- src/plugins/intel_cpu/src/extension.cpp | 3 +- .../transformation_pipeline.cpp | 6 + .../group_normalization.cpp | 54 +++++++ .../single_op/group_normalization.hpp | 2 +- 27 files changed, 505 insertions(+), 31 deletions(-) create mode 100644 src/common/snippets/include/snippets/op/reshape.hpp create mode 100644 src/common/snippets/include/snippets/pass/gn_decomposition.hpp create mode 100644 src/common/snippets/include/snippets/pass/gn_tokenization.hpp create mode 100644 src/common/snippets/src/op/reshape.cpp create mode 100644 src/common/snippets/src/pass/gn_decomposition.cpp create mode 100644 
src/common/snippets/src/pass/gn_tokenization.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp new file mode 100644 index 00000000000000..8375f3a050e112 --- /dev/null +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" + +namespace ov { +namespace snippets { +namespace op { + +/** + * @interface Reshape + * @brief Reshape input tensor to reqiured target shape + * @ingroup snippets + */ +class Reshape : public ov::op::Op { +public: + OPENVINO_OP("Reshape", "SnippetsOpset"); + Reshape(const Output& x, ov::PartialShape target_shape); + Reshape() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + const ov::PartialShape& get_target_shape() const; + void set_target_shape(ov::PartialShape shape); + +private: + ov::PartialShape m_target_shape = {}; +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/gn_decomposition.hpp b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp new file mode 100644 index 00000000000000..8bd80f90c790ff --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/matcher.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface GNDecomposition + * @brief Decomposes 
GroupNormalization to a range of low-level operations + * @ingroup snippets + */ +class GNDecomposition: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("GNDecomposition", "0"); + GNDecomposition(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/gn_tokenization.hpp b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp new file mode 100644 index 00000000000000..4ea39b391b4d05 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "snippets/pass/tokenization.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface TokenizeGNSnippets + * @brief Tokenize GroupNormalization to a subgraph + * @ingroup snippets + */ +class TokenizeGNSnippets: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TokenizeGNSnippets", "0"); + TokenizeGNSnippets(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index f6cd6f0626f798..a3dffd973c93dd 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -75,5 +75,12 @@ class ReduceShapeInfer : public IShapeInferSnippets { Result infer(const std::vector& input_shapes) override; }; +class ReshapeShapeInfer : public IShapeInferSnippets { + ov::PartialShape target_shape; +public: + explicit ReshapeShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; +}; + } // 
namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index f0564becaf24b5..08002fa38ed309 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -17,6 +17,7 @@ #include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" +#include "op/reshape.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index fed0dfcdd5c2b4..9b207b09fe411f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -16,6 +16,7 @@ OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) +OV_OP(Reshape, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 8a0ae29f281097..027314d5ad4cb5 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -81,6 +81,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 64bf3d0b53f712..05d3a934d2b2a4 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -366,9 +366,12 @@ VectorDims LinearIR::get_master_shape() const { // Note: Snippets would benefit from a more generic master_shape calculation approach. 
// It will be implemented in the scope of ROI propagation activity (ticket 120505) const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); + auto last_exp = source.get_expr(); if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 && ov::is_type(source.get_expr()->get_node())) { master_shape = utils::get_preordered_vdims(source); + } else if (out_exprs.size() == 1 && ov::is_type(last_exp->get_node())) { + master_shape = utils::get_preordered_vdims(last_exp->get_input_port_connector(0)->get_source()); } else { for (const auto& oe : out_exprs) { const auto& port_desc = oe->get_input_port_descriptor(0); diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index c7cf6b67abd8ea..aa13a7681dcea3 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -58,8 +58,8 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const // After Loop initialization, Buffer can be connected to LoopEnd - it's ok continue; } else { - OPENVINO_THROW( - "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); + // OPENVINO_THROW( + // "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); } } } diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index e4b828547e9ce5..13b4727151681b 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -79,15 +79,24 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (io_expr->get_type() == IOExpression::io_type::INPUT) { const auto& out_connector = expr->get_output_port_connector(0); manually_assigned_gprs[out_connector] = io_expr->get_index(); - const auto& 
consumer_inputs = out_connector->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - // TODO [96434]: Support RankNormalization (Reshape) in arbitrary place in pipeline, not just after inputs - if (ov::is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - manually_assigned_gprs[first_consumer->get_output_port_connector(0)] = io_expr->get_index(); + // TODO [96434]: Support RankNormalization/Reshape in arbitrary place in pipeline, not just after inputs + // reshape rankNormalization sequence + auto consumer_inputs = out_connector->get_consumers(); + auto child_exp = consumer_inputs.begin()->get_expr(); + while (ov::is_type(child_exp->get_node()) || + ov::is_type(child_exp->get_node())) { + OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization or Reshape is supposed to be the only consumer"); + manually_assigned_gprs[child_exp->get_output_port_connector(0)] = io_expr->get_index(); + consumer_inputs = child_exp->get_output_port_connector(0)->get_consumers(); + child_exp = consumer_inputs.begin()->get_expr(); } } else if (io_expr->get_type() == IOExpression::io_type::OUTPUT) { manually_assigned_gprs[expr->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + // reshape before result + const auto &parent = expr->get_input_port_connector(0)->get_source().get_expr(); + if (ov::is_type(parent->get_node())) { + manually_assigned_gprs[parent->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + } } else { OPENVINO_THROW("Unsupported io_type detected"); } @@ -97,6 +106,14 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (ov::is_type(buffer)) { manually_assigned_gprs[expr->get_input_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); + // reshape in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. 
+ const auto& first_consumer = expr->get_output_port_connector(0)->get_consumers().begin()->get_expr(); + if (ov::is_type(first_consumer->get_node())) { + manually_assigned_gprs[first_consumer->get_input_port_connector(0)] = + static_cast(num_results + num_parameters + buffer_id); + manually_assigned_gprs[first_consumer->get_output_port_connector(0)] = + static_cast(num_results + num_parameters + buffer_id); + } } manually_assigned_gprs[expr->get_output_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index eb72f971ced1c4..2b10c1934a33b1 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -147,16 +147,35 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const auto& expr = entry_port->get_expr(); const auto port_idx = entry_port->get_index(); const auto node = expr->get_node(); - const auto& parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + auto parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + + auto first_not_reshape_parent_output = [&]() { + auto parent_expr = parent_expr_output.get_expr(); + while (is_type(parent_expr->get_node())) { + parent_expr_output = parent_expr->get_input_port_connector(0)->get_source(); + parent_expr = parent_expr_output.get_expr(); + } + }; + // this parent(before reshape) is used to determine if buffer needed according loopInfo + first_not_reshape_parent_output(); const auto& parent_expr = parent_expr_output.get_expr(); - const auto parent_port = parent_expr_output.get_index(); - const auto parent = parent_expr->get_node(); + const auto& parent_port = parent_expr_output.get_index(); + const auto& parent = parent_expr->get_node(); if (ov::is_type(parent) || ov::is_type(parent) || ov::is_type(parent) || ov::is_type(parent)) continue; + // insert 
buffer before reshape + auto buffer_child = expr; + bool parent_is_reshape = false; + auto p_exp = expr->get_input_port_connector(port_idx)->get_source().get_expr(); + if (is_type(p_exp->get_node())) { + buffer_child = p_exp; + parent_is_reshape = true; + } + // Each MemoryAccess op needs Buffer const auto parent_ma = ov::as_type_ptr(parent); const auto node_ma = ov::as_type_ptr(node); @@ -178,7 +197,12 @@ void InsertBuffers::insertion(LinearIR& linear_ir, parent_expr_output, m_buffer_allocation_rank); const auto buffer = std::make_shared(parent->output(parent_port), allocation_shape); - linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { *entry_port }); + if (parent_is_reshape) { + linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, + { buffer_child->get_input_port(0) }); + } else { + linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { *entry_port }); + } } } diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 2accd66309d49a..defaeb6e4ce0df 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -36,12 +36,24 @@ size_t InsertLoadStore::get_count(const ExpressionPort& port) const { bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { std::shared_ptr data_expr = *data_expr_it; - auto consumer_inputs = data_expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - if (is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - data_expr = first_consumer; - } + const auto& consumer_inputs = data_expr->get_output_port_connector(0)->get_consumers(); + auto first_reshape_consumer = 
[&]() { + auto current_exp = data_expr; + auto first_consumer = consumer_inputs.begin()->get_expr(); + while (1) { + if (is_type(first_consumer->get_node()) || + is_type(first_consumer->get_node())) { + current_exp = first_consumer; + first_consumer = first_consumer->get_output_port_connector(0)->get_consumers().begin()->get_expr(); + // OPENVINO_ASSERT(current_exp->get_output_port_connector(0)->get_consumers().size() == 1, + // "RankNormalization or Reshape is supposed to be the only consumer"); + } else { + return current_exp; + } + } + }; + data_expr = first_reshape_consumer(); + const auto& data_ngraph_output = data_expr->get_node()->output(0); bool was_inserted = false; const auto& data_out = data_expr->get_output_port_connector(0); @@ -61,12 +73,17 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr } bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { - const auto& data_expr = *data_expr_it; - const auto& parent_output = data_expr->get_input_port_connector(0)->get_source(); - const auto& parent_expr = parent_output.get_expr(); - const auto port = parent_output.get_index(); - const auto& parent = parent_expr->get_node(); - const auto ma = ov::as_type_ptr(parent); + auto data_expr = *data_expr_it; + auto parent_output = data_expr->get_input_port_connector(0)->get_source(); + auto parent_expr = parent_output.get_expr(); + if (is_type(parent_expr->get_node())) { + data_expr = parent_expr; + parent_output = data_expr->get_input_port_connector(0)->get_source(); + parent_expr = parent_output.get_expr(); + } + auto port = parent_output.get_index(); + auto parent = parent_expr->get_node(); + auto ma = ov::as_type_ptr(parent); if (ma && ma->is_memory_access_output_port(port)) return false; diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 3ff96b6ce374f4..ded7de36040576 100644 --- 
a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -27,7 +27,8 @@ bool MarkLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l return ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); }; auto are_conflicted = [](const ExpressionPort& lhs, const ExpressionPort& rhs) { diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp new file mode 100644 index 00000000000000..308a13c6f3e8c0 --- /dev/null +++ b/src/common/snippets/src/op/reshape.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/reshape.hpp" +#include "snippets/utils.hpp" + + +namespace ov { +namespace snippets { +namespace op { +Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) + : Op({arg}), m_target_shape(target_shape) { + constructor_validate_and_infer_types(); +} + +void Reshape::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), m_target_shape); +} + +std::shared_ptr Reshape::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Reshape); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), get_target_shape()); +} + +bool Reshape::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("target_shape", m_target_shape); + return true; +} + +const ov::PartialShape& Reshape::get_target_shape() const { + return m_target_shape; +} + +void Reshape::set_target_shape(ov::PartialShape shape) { + m_target_shape = shape; +} +}// namespace op +}// namespace snippets +}// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 70224751f1f810..92b76084996927 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ 
b/src/common/snippets/src/op/subgraph.cpp @@ -18,6 +18,7 @@ #include "snippets/pass/canonicalization.hpp" #include "snippets/pass/align_element_types.hpp" #include "snippets/pass/reduce_to_snippets_reduce.hpp" +#include "snippets/pass/gn_decomposition.hpp" #include "snippets/utils.hpp" @@ -77,7 +78,8 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo ov::is_type(op) || ov::is_type(op) || ov::is_type(op) || // Broadcast is domain sensetive op because the output shape depends on - ov::is_type(op); // the both input and broadcast shapes (the both - are inputs of op). Note: is used only in MHA pattern + ov::is_type(op) || // the both input and broadcast shapes (the both - are inputs of op). Note: is used only in MHA pattern + ov::is_type(op); } void Subgraph::init_config() { @@ -319,7 +321,8 @@ VectorDims Subgraph::infer_master_shape() { OPENVINO_ASSERT(!output_dims.empty(), "Can't calculate master_shape before the first shape inference"); } else { for (const auto& res : body_ptr()->get_results()) { - const auto& res_input = res->input(0); + auto reshape = ov::as_type_ptr(res->get_input_node_shared_ptr(0)); + auto res_input = reshape ? reshape->input(0) : res->input(0); OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result have dynamic shape in static pipeline"); // We need to account to the shape's layout stored in Output rt_info const auto& planar_shape = utils::get_preordered_pshape(res_input.get_source_output()); @@ -383,6 +386,9 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::data_flow_transformations") ov::snippets::pass::Manager manager; + // GroupNormalizationDecomposition should be before canonicalization(rankNorm) as scale/bias shape is C and need special process.
+ if (config.m_has_domain_sensitive_ops) + manager.register_pass(); if (!blocked_input_shapes.empty()) manager.register_pass(blocked_input_shapes); if (!input_precisions.empty() && !output_precisions.empty()) @@ -478,6 +484,9 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); VectorDims parallel_exec_domain = linear_ir.get_master_shape(); diff --git a/src/common/snippets/src/pass/align_element_types.cpp b/src/common/snippets/src/pass/align_element_types.cpp index 625294d9e092e4..08430af05a0745 100644 --- a/src/common/snippets/src/pass/align_element_types.cpp +++ b/src/common/snippets/src/pass/align_element_types.cpp @@ -82,7 +82,8 @@ bool pass::AlignElementTypes::run_on_model(const std::shared_ptr& m) const auto& first_child = consumer_inputs.begin()->get_node()->shared_from_this(); // Note: RankNormalization of is designed for shape-inference purposes only. // It does not process any data (nor does it emit any code), so it doesn't require Convert operations - if (is_type(first_child)) { + if (is_type(first_child) || + is_type(first_child)) { OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); parent_output = first_child->output(0); consumer_inputs = parent_output.get_target_inputs(); diff --git a/src/common/snippets/src/pass/gn_decomposition.cpp b/src/common/snippets/src/pass/gn_decomposition.cpp new file mode 100644 index 00000000000000..0beeba52992c4c --- /dev/null +++ b/src/common/snippets/src/pass/gn_decomposition.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/gn_decomposition.hpp" + +#include "openvino/op/group_normalization.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "snippets/itt.hpp" +#include "snippets/lowered/port_descriptor.hpp" +#include "snippets/snippets_isa.hpp" +#include "openvino/core/rt_info.hpp" + +namespace 
ov { +namespace snippets { +namespace pass { +using namespace lowered; + +// groupNorm -> reshape + mvn + reshape + mul + add, +// where mvn = (x - mean) / Sqrt(ReduceMean((x - mean) ^ 2) + eps), +// where mean = ReduceMean(x, axes) +GNDecomposition::GNDecomposition() { + MATCHER_SCOPE(GNDecomposition); + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::GNDecomposition") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + + const auto data = group_norm_node->input_value(0); + const auto scale = group_norm_node->input_value(1); + const auto bias = group_norm_node->input_value(2); + + const auto num_groups = static_cast(group_norm_node->get_num_groups()); + const float eps = static_cast(group_norm_node->get_epsilon()); + + ////////////collapse to reduce lastDim to avoid nested loop overhead(reduce tails process)/////////// + // reshape [N, C, spatial] to [N, group, 1, (C / group) * spatial] + const auto orig_shape = group_norm_node->get_input_partial_shape(0); + size_t orig_rank = orig_shape.rank().get_length(); + size_t group_rank = 4; + std::vector group_dims(group_rank); + group_dims[0] = orig_shape[0]; + group_dims[1] = Dimension(num_groups); + group_dims[2] = Dimension(1); + group_dims[3] = Dimension(orig_shape[1] / num_groups); + Dimension spatial_dim = 1; + for (size_t i = 2; i < orig_rank; ++i) { + spatial_dim = spatial_dim * orig_shape[i]; + } + group_dims[3] = group_dims[3] * spatial_dim; + ov::PartialShape group_shape(group_dims); + std::shared_ptr reshaped_node = std::make_shared(data, group_shape); + + // reduceSum on dimension [C / group * spatial] + std::vector axis(1, 3); + auto axis_node = std::make_shared(element::i64, Shape{axis.size()}, axis); + const auto reduce_sum = std::make_shared(reshaped_node, axis_node, true); + + // reduceMean + auto group_shape_static =
group_shape.to_shape(); + float group_size_inv = 1.0f / static_cast(group_shape_static[3]); + const auto group_size_inv_node = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + const auto reduce_mean = std::make_shared(reduce_sum, group_size_inv_node); + + // x - mean + auto sub_mean = std::make_shared(reshaped_node, reduce_mean); + // (x - mean) ^ 2 + auto sqr_const = std::make_shared(element::f32, Shape{1}, std::vector{2}); + auto sqr = std::make_shared(sub_mean, sqr_const); + // reduceSum((x - mean) ^ 2) + auto sqr_reduce_sum = std::make_shared(sqr, axis_node, true); + // reduceMean((x - mean) ^ 2) + const auto group_size_inv_node_aux = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + auto sqr_mean = std::make_shared(sqr_reduce_sum, group_size_inv_node_aux); + // reduceMean((x - mean) ^ 2) + eps + auto eps_node = std::make_shared(element::f32, Shape{1}, std::vector{eps}); + auto eps_add = std::make_shared(sqr_mean, eps_node); // fma to this add and parent multiply + // variance = sqrt( reducemean( (x - mean) ^ 2 ) + eps ) + auto variance = std::make_shared(eps_add); + + // divide variance + const auto variance_inv = std::make_shared(variance, -1.f); + auto mvn = std::make_shared(sub_mean, variance_inv); + + // reshape mvn from [N, group, 1, (C / group) * spatial] to [N, group, C / group, spatial] + std::vector group_channel_dims(group_rank); + group_channel_dims[0] = group_dims[0]; + group_channel_dims[1] = group_dims[1]; + group_channel_dims[2] = Dimension(orig_shape[1] / num_groups); + group_channel_dims[3] = spatial_dim; + ov::PartialShape group_channel_shape(group_channel_dims); + const auto mvn_reshaped = std::make_shared(mvn, group_channel_shape); + + // reshape scale and bias to [1, group, C / group, 1] + std::vector scale_bias_dims(group_rank, Dimension(1)); + scale_bias_dims[1] = group_channel_dims[1]; + scale_bias_dims[2] = group_channel_dims[2]; + ov::PartialShape scale_bias_shape(scale_bias_dims); + 
std::shared_ptr reshape_scale = std::make_shared(scale, scale_bias_shape); + if (scale.get_element_type() != element::f32) { + reshape_scale = std::make_shared(reshape_scale, element::f32); + } + std::shared_ptr reshape_bias = std::make_shared(bias, scale_bias_shape); + if (bias.get_element_type() != element::f32) { + reshape_bias = std::make_shared(reshape_bias, element::f32); + } + + // scaled mvn_reshape[2,5,2,64] reshape_scale[1,5,2,1] -> scaled_node[2,5,2,64] + auto scaled_node = std::make_shared(mvn_reshaped, reshape_scale); + auto biased_node = std::make_shared(scaled_node, reshape_bias); + + // reshape_back [N, group, C / group, spatial] to [N, C, spatial] + const auto reshape_back_node = std::make_shared(biased_node, orig_shape); + + std::vector subtensor(group_rank, 1); + subtensor[3] = PortDescriptor::ServiceDimensions::FULL_DIM; + PortDescriptorUtils::set_port_descriptor_ptr(reduce_sum->input(0), std::make_shared(reduce_sum->input(0), subtensor)); + PortDescriptorUtils::set_port_descriptor_ptr(reduce_sum->output(0), std::make_shared(reduce_sum->output(0), subtensor)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_reduce_sum->input(0), std::make_shared(sqr_reduce_sum->input(0), subtensor)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_reduce_sum->output(0), std::make_shared(sqr_reduce_sum->output(0), subtensor)); + + return ov::replace_node_update_name(group_norm_node, reshape_back_node); + }; + + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/pass/gn_tokenization.cpp b/src/common/snippets/src/pass/gn_tokenization.cpp new file mode 100644 index 00000000000000..23a97dc657f81c --- /dev/null +++ b/src/common/snippets/src/pass/gn_tokenization.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"snippets/pass/gn_tokenization.hpp" + +#include "snippets/itt.hpp" +#include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + +ov::snippets::pass::TokenizeGNSnippets::TokenizeGNSnippets() { + MATCHER_SCOPE(TokenizeGNSnippets); + + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::TokenizeGNSnippets") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + if (group_norm_node->is_dynamic()) + return false; + + auto subgraph = op::Subgraph::wrap_node_as_subgraph(group_norm_node); + subgraph->get_rt_info()["originalLayersNames"] = group_norm_node->get_friendly_name(); + ov::replace_node(group_norm_node, subgraph); + op::update_out_tensor_name(subgraph); + + // mark the Subgraph as Completed to not allow Snippets to include any nodes into the GN Subgraph in common Tokenization + SetSnippetsSubgraphType(subgraph, SnippetsSubgraphType::Completed); + + return true; + }; + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/tokenization.cpp b/src/common/snippets/src/pass/tokenization.cpp index 4b96a1f60a8977..30cbb42a031f46 100644 --- a/src/common/snippets/src/pass/tokenization.cpp +++ b/src/common/snippets/src/pass/tokenization.cpp @@ -9,6 +9,7 @@ #include "snippets/pass/common_optimizations.hpp" #include "snippets/pass/extract_reshapes_from_mha.hpp" #include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/gn_tokenization.hpp" #include "snippets/pass/collapse_subgraph.hpp" @@ -81,6 +82,7 @@ bool SnippetsTokenization::run_on_model(const std::shared_ptr& m) { manager.register_pass(); manager.register_pass(); manager.register_pass(m_config); + manager.register_pass(); manager.register_pass(); 
manager.register_pass(m_config); manager.run_passes(m); diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index e8df94bb670d12..c00b3ceda0e0bf 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -245,5 +245,19 @@ Result ReduceShapeInfer::infer(const std::vector& input_shapes) { return {{result_shape}, ShapeInferStatus::success}; } +ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); + target_shape = reshape->get_target_shape(); +} + +Result ReshapeShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes passed ReshapeShapeInfer"); + VectorDims result_shape = target_shape.get_shape(); + // todo: check static and size is the same + return {{result_shape}, ShapeInferStatus::success}; +} + + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index b6760d87e1afcb..94749d178837c0 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -89,6 +89,8 @@ std::shared_ptr make_shape_inference(const std::shared_ptr< ov::is_type(op) || ov::is_type(op)) { return std::make_shared(); + } else if (ov::is_type(op)) { + return std::make_shared(op); } else { OPENVINO_THROW("Operation type " + std::string(op->get_type_info().name) + " is not supported in Snippets shape inference pipeline"); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index cf46840aad8407..3a23ce5de6e655 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -138,6 +138,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::NewMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 75372646a23622..9aec5d4a933f5e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -218,7 +218,14 @@ jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_g break; } case snippets::lowered::IOExpression::io_type::OUTPUT: { - desc = expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); + // store->reshape->result + const auto& source = expr->get_input_port_connector(0)->get_source(); + auto p_exp = source.get_expr(); + if (ov::is_type(p_exp->get_node())) { + desc = p_exp->get_input_port_connector(0)->get_source().get_descriptor_ptr(); + } else { + desc = expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); + } etype = expr->get_node()->get_input_element_type(0); break; } default : { diff --git 
a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 15b18de1d9689a..34b56d0cd4e5af 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -168,7 +168,8 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::VectorBuffer) \ OP_EXTENSION(ov::snippets::op::RankNormalization) \ OP_EXTENSION(ov::snippets::op::ReduceMax) \ - OP_EXTENSION(ov::snippets::op::ReduceSum) + OP_EXTENSION(ov::snippets::op::ReduceSum) \ + OP_EXTENSION(ov::snippets::op::Reshape) OPENVINO_CREATE_EXTENSIONS(std::vector( {CPU_EXTENSIONS TYPE_RELAXED_EXTENSIONS SNIPPETS_EXTENSIONS SNIPPETS_DEBUG_CAPS_EXTENSIONS})); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 8dbdd42cee0726..8d28165b924dd5 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -67,6 +67,7 @@ #include "transformations/op_conversions/hswish_decomposition.hpp" #include "transformations/op_conversions/gru_cell_decomposition.hpp" #include "transformations/op_conversions/lstm_cell_decomposition.hpp" +#include "transformations/op_conversions/gn_decomposition.hpp" #include "transformations/op_conversions/mvn6_decomposition.hpp" #include "transformations/op_conversions/normalize_l2_decomposition.hpp" #include "transformations/op_conversions/reduce_l1_decomposition.hpp" @@ -470,6 +471,11 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }, ov::pass::NormalizeL2Decomposition); + CPU_SET_CALLBACK_X64(manager, + [](const_node_ptr &node) -> bool { + return !node->is_dynamic() && node->get_output_element_type(0) == element::f32; + }, + ov::pass::GNDecomposition); CPU_ENABLE_PASS_COMMON(manager, ov::pass::SoftmaxDecomposition); CPU_SET_CALLBACK_COMMON(manager, [](const_node_ptr &node) -> bool { diff 
--git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp new file mode 100644 index 00000000000000..68f573383c00d1 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "single_layer_tests/group_normalization.hpp" + +using namespace ov::test::subgraph; + +namespace { + +const std::vector netPrecisions = { + ov::element::f32, + ov::element::bf16, + ov::element::i8 +}; + +const std::vector inputShapes = { + // static shape + {{1, 4, 1, 8}, {{1, 4, 1, 8}}}, + {{3, 8, 2, 32}, {{3, 8, 2, 32}}}, + {{3, 8, 16, 8, 4}, {{3, 8, 16, 8, 4}}}, + {{3, 8, 16, 8, 64}, {{3, 8, 16, 8, 64}}}, + {{3, 8, 16, 100, 4}, {{3, 8, 16, 100, 4}}}, + {{3, 16, 16, 8, 4}, {{3, 16, 16, 8, 4}}}, + {{1, 8, 8}, {{1, 8, 8}}}, + {{1, 8, 1, 8, 2}, {{1, 8, 1, 8, 2}}}, + {{1, 8, 1, 8, 2, 2}, {{1, 8, 1, 8, 2, 2}}}, + {{1, 8, 1, 8, 2, 2, 2}, {{1, 8, 1, 8, 2, 2, 2}}}, + // dynamic shape + {{-1, -1, -1, -1}, {{1, 16, 8, 8}, {2, 8, 4, 4}, {1, 16, 8, 8}}}, + {{{1, 4}, 16, -1, -1}, {{1, 16, 6, 6}, {4, 16, 10, 10}, {1, 16, 6, 6}}} +}; + +const std::vector numGroups = { + 2, 4, +}; + +const std::vector epsilon = { + 0.0001 +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalization, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + testing::ValuesIn(inputShapes), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_CPU), + testing::Values(ov::AnyMap())), + GroupNormalizationTest::getTestCaseName); + +} // anonymous namespace \ No newline at end of file diff --git
a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp index 612c53db90ab39..233ed080bf28e0 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp @@ -81,7 +81,7 @@ class GroupNormalizationTest : public testing::WithParamInterfaceov::Shape { return {s[1]}; }); InputShape biasInputShape { - shape.first.is_dynamic() ? ov::PartialShape{shape.first[1]} : shape.first, + ov::PartialShape{shape.first[1]}, std::move(biasShape) }; return biasInputShape;