diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 4c10f112bc2c42..f22ba76cba19e4 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -24,6 +24,7 @@ class Generator; * @brief Holds all relevant information produced during lowering * @param compiled_snippet pointer to interface class that encapsulates compiled binary code * @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute. + * @param buffer_inplace_output buffer share memory with subgraph output result. -1 means no sharing. i>=0 means share ith output memory. * Must be allocated and freed by the backend. */ class LoweringResult { @@ -35,6 +36,7 @@ class LoweringResult { public: std::shared_ptr compiled_snippet = nullptr; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; }; /** diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 9c8ac3f1f25b4d..2ebaa7c2ab1728 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -223,6 +223,22 @@ class LinearIR { */ exprIt replace_with_expr(const std::vector& old_exprs, const ExpressionPtr& new_expr); + /** + * @brief Propagate start_expr through zero to several consecutive shape infer exprs(such as reshape, rankNormalization). + * @param start_expr Propagate from start_expr. + * @param downstream Propagate downstream if it's true, otherwise propagate upstream. + * @return shape infer op consumers as a sequence if downstream, or shape infer op sources as a sequence if upstream. + */ + static std::vector propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream); + + /** + * @brief Get last shape infer op from start_expr in a sequence. 
If no shape infer op is connected to start_expr, return start_expr. + * @param start_expr Search from start_expr. + * @param downstream search downstream if it's true, otherwise search upstream. + * @return last shape infer expr + */ + static ExpressionPtr get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream); + private: std::shared_ptr m_shape_infer = nullptr; diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp index 1ec9598ec1d2c2..b31a8ced9da702 100644 --- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp @@ -26,7 +26,7 @@ namespace pass { class AllocateBuffers: public RangedPass { public: OPENVINO_RTTI("AllocateBuffers", "RangedPass") - AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true); + AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true); /** * @brief Apply the pass to the Linear IR @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass { using BufferCluster = std::set; using BufferClusters = std::vector; + private: size_t& m_buffer_scratchpad_size; + int& m_buffer_inplace_output; bool m_is_optimized_mode = true; }; diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp new file mode 100644 index 00000000000000..8375f3a050e112 --- /dev/null +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" + +namespace ov { +namespace snippets { +namespace op { + +/** + * @interface Reshape + * @brief Reshape input tensor to required target shape + * @ingroup snippets + */ +class Reshape : public
ov::op::Op { +public: + OPENVINO_OP("Reshape", "SnippetsOpset"); + Reshape(const Output& x, ov::PartialShape target_shape); + Reshape() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + const ov::PartialShape& get_target_shape() const; + void set_target_shape(ov::PartialShape shape); + +private: + ov::PartialShape m_target_shape = {}; +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 2c0558abdc7529..2b02bac7b7b5c6 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -139,6 +139,8 @@ class Subgraph : public ov::op::util::SubGraphOp { // Return estimated unique buffer count (upper bound). It's needed for tokenization static auto get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t; static auto is_domain_sensitive_op(const std::shared_ptr& op) -> bool; + static auto is_shape_infer_op(const std::shared_ptr& op) -> bool; + static auto get_last_shape_infer_op(const std::shared_ptr& op, bool downstream) -> std::shared_ptr; void data_flow_transformations(const BlockedShapeVector& blocked_input_shapes = {}, const std::vector& input_precisions = {}, diff --git a/src/common/snippets/include/snippets/pass/gn_decomposition.hpp b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp new file mode 100644 index 00000000000000..8bd80f90c790ff --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_decomposition.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/matcher.hpp" + +namespace ov { +namespace snippets { +namespace pass { + 
+/** + * @interface GNDecomposition + * @brief Decomposes GroupNormalization to a range of low-level operations + * @ingroup snippets + */ +class GNDecomposition: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("GNDecomposition", "0"); + GNDecomposition(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/gn_tokenization.hpp b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp new file mode 100644 index 00000000000000..220f05f0bbbc88 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/gn_tokenization.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/pattern/matcher.hpp" +#include "snippets/pass/tokenization.hpp" + +namespace ov { +namespace snippets { +namespace pass { + +/** + * @interface TokenizeGNSnippets + * @brief Tokenize GroupNormalization to a subgraph + * @ingroup snippets + */ +class TokenizeGNSnippets : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("TokenizeGNSnippets", "0"); + TokenizeGNSnippets(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index f6cd6f0626f798..a3dffd973c93dd 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -75,5 +75,12 @@ class ReduceShapeInfer : public IShapeInferSnippets { Result infer(const std::vector& input_shapes) override; }; +class ReshapeShapeInfer : public IShapeInferSnippets { + ov::PartialShape target_shape; +public: + explicit ReshapeShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; 
+}; + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index f0564becaf24b5..08002fa38ed309 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -17,6 +17,7 @@ #include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" +#include "op/reshape.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index fed0dfcdd5c2b4..9b207b09fe411f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -16,6 +16,7 @@ OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) +OV_OP(Reshape, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 41d87f0e0fe83d..9669796628ad44 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -154,6 +154,14 @@ VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port); * @return preordered shape: `shape[i]` = `planar_shape[order[i]]` where `shape` is shape before applying the order. 
*/ VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port); +/** + * @brief Returns element count of a shape + * @param shape input shape + * @return element count of input shape + */ +inline auto get_shape_size(const VectorDims& shape) -> size_t { + return std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()); +} /* --------------------------- */ } // namespace utils diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 8a0ae29f281097..027314d5ad4cb5 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -81,6 +81,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 64bf3d0b53f712..3bff272f03f6c4 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -12,6 +12,7 @@ #include "openvino/core/graph_util.hpp" #include "openvino/core/type.hpp" #include "snippets/utils.hpp" +#include "snippets/op/subgraph.hpp" namespace ov { namespace snippets { @@ -365,10 +366,14 @@ VectorDims LinearIR::get_master_shape() const { } // Note: Snippets would benefit from a more generic master_shape calculation approach. 
// It will be implemented in the scope of ROI propagation activity (ticket 120505) - const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); - if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 && - ov::is_type(source.get_expr()->get_node())) { - master_shape = utils::get_preordered_vdims(source); + if (out_exprs.size() == 1) { + const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source(); + if (!m_config.m_enable_domain_optimization && ov::is_type(source.get_expr()->get_node())) { + master_shape = utils::get_preordered_vdims(source); + } else { + auto last_shape_infer_expr = LinearIR::get_last_shape_infer_expr(out_exprs[0], false); + master_shape = utils::get_preordered_vdims(last_shape_infer_expr->get_input_port_connector(0)->get_source()); + } } else { for (const auto& oe : out_exprs) { const auto& port_desc = oe->get_input_port_descriptor(0); @@ -493,6 +498,74 @@ LinearIR::exprIt LinearIR::replace_with_expr(const std::vector& o return replace_with_expr(old_exprs, new_expr, insertion_place); } +std::vector LinearIR::propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream) { + std::vector shape_infer_exprs; + auto current_exp = start_expr; + if (op::Subgraph::is_shape_infer_op(current_exp->get_node())) { + shape_infer_exprs.push_back(current_exp); + } + if (downstream) { + if (current_exp->get_output_count() == 0) + return shape_infer_exprs; + auto consumers = current_exp->get_output_port_connector(0)->get_consumers(); + auto first_child = consumers.begin()->get_expr(); + while (op::Subgraph::is_shape_infer_op(first_child->get_node())) { + OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer."); + shape_infer_exprs.push_back(first_child); + current_exp = first_child; + if (current_exp->get_output_count() == 0) + break; + consumers = current_exp->get_output_port_connector(0)->get_consumers(); + first_child = consumers.begin()->get_expr(); 
+ } + return shape_infer_exprs; + } else { + // upstream + if (current_exp->get_input_count() == 0) + return shape_infer_exprs; + auto first_source = current_exp->get_input_port_connector(0)->get_source().get_expr(); + while (op::Subgraph::is_shape_infer_op(first_source->get_node())) { + shape_infer_exprs.push_back(first_source); + current_exp = first_source; + if (current_exp->get_input_count() == 0) + break; + first_source = current_exp->get_input_port_connector(0)->get_source().get_expr(); + } + return shape_infer_exprs; + } +} + +ExpressionPtr LinearIR::get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream) { + auto last_exp = start_expr; + if (downstream) { + if (last_exp->get_output_count() == 0) + return last_exp; + auto consumers = last_exp->get_output_port_connector(0)->get_consumers(); + auto first_child = consumers.begin()->get_expr(); + while (op::Subgraph::is_shape_infer_op(first_child->get_node())) { + OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer."); + last_exp = first_child; + if (last_exp->get_output_count() == 0) + break; + consumers = last_exp->get_output_port_connector(0)->get_consumers(); + first_child = consumers.begin()->get_expr(); + } + return last_exp; + } else { + // upstream + if (last_exp->get_input_count() == 0) + return last_exp; + auto first_source = last_exp->get_input_port_connector(0)->get_source().get_expr(); + while (op::Subgraph::is_shape_infer_op(first_source->get_node())) { + last_exp = first_source; + if (last_exp->get_input_count() == 0) + break; + first_source = last_exp->get_input_port_connector(0)->get_source().get_expr(); + } + return last_exp; + } +} + LinearIR::LIRShapeInfer::LIRShapeInfer(container& body_exprs, io_container& io_exprs) : ShapeInferSnippetsNode(), m_exprs{std::make_shared(body_exprs)} { diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index 
c7cf6b67abd8ea..ffc842c6af4078 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -19,8 +19,8 @@ namespace snippets { namespace lowered { namespace pass { -AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized) - : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {} +AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized) + : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized), m_buffer_inplace_output(buffer_inplace_output) {} void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) { // If Buffer has offset We set this offset in the connected MemoryAccess ops @@ -46,7 +46,8 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const } } // Propagate to down: in Load. Buffer can have several Load - const auto& buffer_out = buffer_expr->get_output_port_connector(0); + auto last_shape_infer = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(buffer_expr, true); + const auto& buffer_out = last_shape_infer->get_output_port_connector(0); for (const auto& child_expr_input : buffer_out->get_consumers()) { const auto& child_expr = child_expr_input.get_expr(); const auto port = child_expr_input.get_index(); @@ -59,7 +60,7 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const continue; } else { OPENVINO_THROW( - "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); + "Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation"); } } } @@ -77,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const pipeline.register_pass(m_buffer_scratchpad_size, buffer_clusters); pipeline.register_pass(); 
pipeline.run(linear_ir); + m_buffer_inplace_output = 0; } else { InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); } diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index e4b828547e9ce5..65499819b3685a 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -5,6 +5,7 @@ #include "snippets/lowered/pass/assign_registers.hpp" #include "snippets/lowered/linear_ir.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/itt.hpp" @@ -79,15 +80,23 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (io_expr->get_type() == IOExpression::io_type::INPUT) { const auto& out_connector = expr->get_output_port_connector(0); manually_assigned_gprs[out_connector] = io_expr->get_index(); - const auto& consumer_inputs = out_connector->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - // TODO [96434]: Support RankNormalization (Reshape) in arbitrary place in pipeline, not just after inputs - if (ov::is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - manually_assigned_gprs[first_consumer->get_output_port_connector(0)] = io_expr->get_index(); + // TODO [96434]: Support shape infer ops in arbitrary place in pipeline, not just after inputs + // shape infer ops sequence after input + auto shape_infer_consumers = LinearIR::propagate_expr_through_shape_infer_ops(io_expr, true); + if (!shape_infer_consumers.empty()) { + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = io_expr->get_index(); + } } } else if (io_expr->get_type() == IOExpression::io_type::OUTPUT) { 
manually_assigned_gprs[expr->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + // shape infer ops sequence before result + auto shape_infer_sources = LinearIR::propagate_expr_through_shape_infer_ops(io_expr, false); + if (!shape_infer_sources.empty()) { + for (const auto& parent_shape_infer_expr : shape_infer_sources) { + manually_assigned_gprs[parent_shape_infer_expr->get_input_port_connector(0)] = num_parameters + io_expr->get_index(); + } + } } else { OPENVINO_THROW("Unsupported io_type detected"); } @@ -97,6 +106,16 @@ bool AssignRegisters::run(LinearIR& linear_ir) { if (ov::is_type(buffer)) { manually_assigned_gprs[expr->get_input_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); + // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. + // child shape info ops share the same memory as IntermediateMemoryBuffer. + auto shape_infer_consumers = LinearIR::propagate_expr_through_shape_infer_ops(expr, true); + if (!shape_infer_consumers.empty()) { + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = + static_cast(num_results + num_parameters + buffer_id); + } + } } manually_assigned_gprs[expr->get_output_port_connector(0)] = static_cast(num_results + num_parameters + buffer_id); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index eb72f971ced1c4..3174add775fae5 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -147,10 +147,22 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const auto& expr = entry_port->get_expr(); const auto port_idx = entry_port->get_index(); const auto node = expr->get_node(); - const auto& 
parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + auto parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); + + const auto& first_parent_expr = parent_expr_output.get_expr(); + bool has_shape_infer_parent = false; + auto top_shape_infer_expr = expr; + // parent before shape infer ops is used to determine if buffer needed according loopInfo + auto shape_infer_parents = LinearIR::propagate_expr_through_shape_infer_ops(first_parent_expr, false); + if (!shape_infer_parents.empty()) { + parent_expr_output = shape_infer_parents.back()->get_input_port_connector(0)->get_source(); + has_shape_infer_parent = true; + top_shape_infer_expr = shape_infer_parents.back(); + } + const auto& parent_expr = parent_expr_output.get_expr(); - const auto parent_port = parent_expr_output.get_index(); - const auto parent = parent_expr->get_node(); + const auto& parent_port = parent_expr_output.get_index(); + const auto& parent = parent_expr->get_node(); if (ov::is_type(parent) || ov::is_type(parent) || ov::is_type(parent) || @@ -178,7 +190,8 @@ void InsertBuffers::insertion(LinearIR& linear_ir, parent_expr_output, m_buffer_allocation_rank); const auto buffer = std::make_shared(parent->output(parent_port), allocation_shape); - linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { *entry_port }); + const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; + linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } } diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 2accd66309d49a..fcc21ceedc1cde 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -36,12 +36,7 @@ size_t InsertLoadStore::get_count(const ExpressionPort& port) const { bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { std::shared_ptr data_expr = *data_expr_it; - auto consumer_inputs = data_expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - if (is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - data_expr = first_consumer; - } + data_expr = LinearIR::get_last_shape_infer_expr(data_expr, true); const auto& data_ngraph_output = data_expr->get_node()->output(0); bool was_inserted = false; const auto& data_out = data_expr->get_output_port_connector(0); @@ -61,7 +56,9 @@ bool InsertLoadStore::insert_load(LinearIR& linear_ir, const LinearIR::constExpr } bool InsertLoadStore::insert_store(LinearIR& linear_ir, const LinearIR::constExprIt& data_expr_it) { - const auto& data_expr = *data_expr_it; + auto data_expr = *data_expr_it; + data_expr = LinearIR::get_last_shape_infer_expr(data_expr, false); + const auto& parent_output = data_expr->get_input_port_connector(0)->get_source(); const auto& parent_expr = parent_output.get_expr(); const auto port = parent_output.get_index(); diff --git a/src/common/snippets/src/lowered/pass/mark_loops.cpp b/src/common/snippets/src/lowered/pass/mark_loops.cpp index 3ff96b6ce374f4..ded7de36040576 100644 --- 
a/src/common/snippets/src/lowered/pass/mark_loops.cpp +++ b/src/common/snippets/src/lowered/pass/mark_loops.cpp @@ -27,7 +27,8 @@ bool MarkLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l return ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); }; auto are_conflicted = [](const ExpressionPort& lhs, const ExpressionPort& rhs) { diff --git a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index 8dc95a94f9c015..b9a57801d6a351 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -32,13 +32,8 @@ void validate_ports(const ExpressionPtr& expr) { void validate_parameter(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Parameter validation expects Parameter op"); - auto consumer_inputs = expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - if (is_type(first_consumer->get_node())) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, - "If there is RankNormalization after Parameter, it should be single consumer of the Parameter"); - consumer_inputs = first_consumer->get_output_port_connector(0)->get_consumers(); - } + auto expr_val = LinearIR::get_last_shape_infer_expr(expr, true); + auto consumer_inputs = expr_val->get_output_port_connector(0)->get_consumers(); std::set> layouts; for (const auto& consumer_input : consumer_inputs) { const auto& node = consumer_input.get_expr()->get_node(); @@ -56,7 +51,8 @@ void validate_parameter(const ExpressionPtr& expr, const LinearIR& linear_ir) { void validate_result(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Result validation expects Result op"); - const auto source = expr->get_input_port_connector(0)->get_source(); + auto expr_val = 
LinearIR::get_last_shape_infer_expr(expr, false); + const auto source = expr_val->get_input_port_connector(0)->get_source(); const auto ma = ov::as_type_ptr(source.get_expr()->get_node()); OPENVINO_ASSERT(ma && ma->is_memory_access_output_port(source.get_index()), "Result expects MemoryAccess parent"); @@ -70,8 +66,8 @@ void validate_buffer(const ExpressionPtr& expr, const LinearIR& linear_ir) { const auto ma = ov::as_type_ptr(source.get_expr()->get_node()); OPENVINO_ASSERT(ma && ma->is_memory_access_input_port(source.get_index()), "Buffer expects MemoryAccess parent"); - - const auto& out = expr->get_output_port_connector(0); + auto expr_val = LinearIR::get_last_shape_infer_expr(expr, true); + const auto& out = expr_val->get_output_port_connector(0); const auto consumers = out->get_consumers(); for (const auto& consumer_input : consumers) { const auto& node = consumer_input.get_expr()->get_node(); diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp new file mode 100644 index 00000000000000..65927f2ee4e2bf --- /dev/null +++ b/src/common/snippets/src/op/reshape.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/reshape.hpp" +#include "snippets/utils.hpp" + + +namespace ov { +namespace snippets { +namespace op { +Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) + : Op({arg}), m_target_shape(std::move(target_shape)) { + constructor_validate_and_infer_types(); +} + +void Reshape::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), m_target_shape); +} + +std::shared_ptr Reshape::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Reshape); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_target_shape); +} + +bool Reshape::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("target_shape", 
m_target_shape); + return true; +} + +const ov::PartialShape& Reshape::get_target_shape() const { + return m_target_shape; +} + +void Reshape::set_target_shape(ov::PartialShape shape) { + m_target_shape = std::move(shape); +} +}// namespace op +}// namespace snippets +}// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 70224751f1f810..30da2c387ff422 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -56,6 +56,8 @@ #include #include +#include "snippets/lowered/pass/serialize_control_flow.hpp" + using namespace std; using namespace ov::op::util; @@ -77,7 +79,42 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo ov::is_type(op) || ov::is_type(op) || ov::is_type(op) || // Broadcast is domain sensetive op because the output shape depends on - ov::is_type(op); // the both input and broadcast shapes (the both - are inputs of op). Note: is used only in MHA pattern + ov::is_type(op) || // the both input and broadcast shapes (the both - are inputs of op). 
Note: is used only in MHA pattern + ov::is_type(op) || + ov::is_type(op); +} + +auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { + return ov::is_type(op) || + ov::is_type(op); +} + +auto Subgraph::get_last_shape_infer_op(const std::shared_ptr& op, bool downstream) -> std::shared_ptr { + auto last_op = op; + if (downstream) { + if (last_op->get_output_size() == 0) + return last_op; + auto first_child = last_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + while (op::Subgraph::is_shape_infer_op(first_child)) { + last_op = first_child; + if (last_op->get_output_size() == 0) + break; + first_child = last_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return last_op; + } else { + // upstream + if (last_op->get_input_size() == 0) + return last_op; + auto first_parent = last_op->get_input_node_shared_ptr(0); + while (op::Subgraph::is_shape_infer_op(first_parent)) { + last_op = first_parent; + if (last_op->get_input_size() == 0) + break; + first_parent = last_op->get_input_node_shared_ptr(0); + } + return last_op; + } } void Subgraph::init_config() { @@ -273,7 +310,8 @@ auto Subgraph::constant_input_should_be_inside_body(const std::shared_ptr(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); } bool Subgraph::check_broadcast(const std::shared_ptr& node) noexcept { @@ -319,7 +357,7 @@ VectorDims Subgraph::infer_master_shape() { OPENVINO_ASSERT(!output_dims.empty(), "Can't calculate master_shape before the first shape inference"); } else { for (const auto& res : body_ptr()->get_results()) { - const auto& res_input = res->input(0); + auto res_input = get_last_shape_infer_op(res, false)->input(0); OPENVINO_ASSERT(res_input.get_partial_shape().is_static(), "Result have dynamic shape in static pipeline"); // We need to account to the shape's layout stored in Output rt_info const auto& planar_shape = 
utils::get_preordered_pshape(res_input.get_source_output()); @@ -405,6 +443,12 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input manager.register_positioned_passes(backend_passes); manager.run_passes(body_ptr()); + + // ov::pass::Manager magr; + // std::string xmlo = "data_flow.xml"; + // std::string bino = "data_flow.bin"; + // magr.register_pass(xmlo, bino); + // magr.run_passes(body_ptr()); } void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, @@ -437,7 +481,8 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized); + pipeline.register_pass(lowering_result.buffer_scratchpad_size, lowering_result.buffer_inplace_output, + linear_ir.get_config().m_are_buffers_optimized); pipeline.register_pass(); pipeline.register_positioned_passes(lowered_backend_passes); pipeline.register_pass(); // must be last @@ -478,6 +523,11 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); VectorDims parallel_exec_domain = linear_ir.get_master_shape(); diff --git a/src/common/snippets/src/pass/align_element_types.cpp b/src/common/snippets/src/pass/align_element_types.cpp index 625294d9e092e4..c159167c7496e7 100644 --- a/src/common/snippets/src/pass/align_element_types.cpp +++ b/src/common/snippets/src/pass/align_element_types.cpp @@ -29,7 +29,7 @@ bool pass::AlignElementTypes::run_on_model(const std::shared_ptr& m) for (size_t i = 0; i < m_output_precisions.size(); i++) { const auto needed_out_type = m_output_precisions[i]; if (results[i]->get_input_element_type(0) != needed_out_type) { - std::shared_ptr consumer = results[i]; + std::shared_ptr consumer = op::Subgraph::get_last_shape_infer_op(results[i], false); auto parent_output = 
consumer->get_input_source_output(0); // Snippets supports Transpose only after Parameter or before Result nodes @@ -76,17 +76,11 @@ bool pass::AlignElementTypes::run_on_model(const std::shared_ptr& m) parameter->set_element_type(needed_in_type); parameter->validate_and_infer_types(); - auto parent_output = parameter->output(0); - auto consumer_inputs = parent_output.get_target_inputs(); - - const auto& first_child = consumer_inputs.begin()->get_node()->shared_from_this(); - // Note: RankNormalization of is designed for shape-inference purposes only. + // Note: shape infer ops is designed for shape-inference purposes only. // It does not process any data (nor does it emit any code), so it doesn't require Convert operations - if (is_type(first_child)) { - OPENVINO_ASSERT(consumer_inputs.size() == 1, "RankNormalization is supposed to be the only consumer"); - parent_output = first_child->output(0); - consumer_inputs = parent_output.get_target_inputs(); - } + auto first_child = op::Subgraph::get_last_shape_infer_op(parameter, true); + auto parent_output = first_child->output(0); + auto consumer_inputs = parent_output.get_target_inputs(); // Snippets supports Transpose only after Parameter or before Result nodes // So we have to insert Convert after Transpose (if there is) on Subgraph inputs diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index 1e10d2dc6dfe6e..7c8089bc776bec 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -6,6 +6,7 @@ #include "snippets/pass/fq_decomposition.hpp" #include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/pass/gn_decomposition.hpp" #include "snippets/pass/explicit_transpose_matmul_inputs.hpp" #include "snippets/pass/transpose_decomposition.hpp" #include "snippets/pass/fuse_transpose_brgemm.hpp" @@ -50,6 +51,7 @@ CommonOptimizations::CommonOptimizations(const 
SnippetsTokenization::Config& con REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::ExplicitTransposeMatMulInputs, is_domain_sensitive); REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::CommonFakeQuantizeDecomposition, is_quantized); REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::SoftmaxReshapeElimination, is_domain_sensitive); + REGISTER_SNIPPETS_PASS(manager, ov::snippets::pass::GNDecomposition, is_domain_sensitive); manager.run_passes(body); ov::snippets::pass::CommonOptimizations::SubgraphManager subgraph_manager; diff --git a/src/common/snippets/src/pass/gn_decomposition.cpp b/src/common/snippets/src/pass/gn_decomposition.cpp new file mode 100644 index 00000000000000..aec78587b588a5 --- /dev/null +++ b/src/common/snippets/src/pass/gn_decomposition.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/gn_decomposition.hpp" + +#include "openvino/op/group_normalization.hpp" +#include "snippets/op/reduce.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "snippets/itt.hpp" +#include "snippets/lowered/port_descriptor.hpp" +#include "snippets/snippets_isa.hpp" +#include "openvino/core/rt_info.hpp" + +namespace ov { +namespace snippets { +namespace pass { +using namespace lowered; + +// groupNorm -> reshape + mvn + reshape + mul + add, +// where mvn = (x - mean) / Sqrt(ReduceMean((x - mean) ^ 2) + eps), +// where mean = ReduceMean(x, axes) +GNDecomposition::GNDecomposition() { + MATCHER_SCOPE(GNDecomposition); + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::GNDecomposition") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + + const auto data = group_norm_node->input_value(0); + const auto scale = group_norm_node->input_value(1); + const auto bias = group_norm_node->input_value(2); + 
+ const auto num_groups = static_cast(group_norm_node->get_num_groups()); + const float eps = static_cast(group_norm_node->get_epsilon()); + + ////////////collapse to reduce lastDim to avoid nested loop overhead(e.g. reduce tails in inner loop)/////////// + // reshape [N, C, spatial] to [N, group, 1, (C / group) * spatial] + const auto orig_shape = group_norm_node->get_input_partial_shape(0); + size_t orig_rank = orig_shape.rank().get_length(); + size_t group_rank = 4; + std::vector group_dims(group_rank); + group_dims[0] = orig_shape[0]; + group_dims[1] = Dimension(num_groups); + group_dims[2] = Dimension(1); + group_dims[3] = Dimension(orig_shape[1] / num_groups); + Dimension spatial_dim = 1; + for (size_t i = 2; i < orig_rank; ++i) { + spatial_dim = spatial_dim * orig_shape[i]; + } + group_dims[3] = group_dims[3] * spatial_dim; + ov::PartialShape group_shape(group_dims); + std::shared_ptr reshaped_node_orig = std::make_shared(data, group_shape); + + std::shared_ptr reshaped_node1 = reshaped_node_orig; + if (data.get_element_type() != element::f32) { + reshaped_node1 = std::make_shared(reshaped_node_orig, element::f32); + } + + const auto reduce_sum = std::make_shared(reshaped_node1, group_rank - 1); + op::ReduceBase::compute_and_set_reduce_subtensors(reduce_sum); + + // reduceMean + auto group_shape_static = group_shape.to_shape(); + float group_size_inv = 1.0f / static_cast(group_shape_static[3]); + const auto group_size_inv_node = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + const auto reduce_mean = std::make_shared(reduce_sum, group_size_inv_node); + + // x - mean + std::shared_ptr reshaped_node2 = reshaped_node_orig; + if (data.get_element_type() != element::f32) { + reshaped_node2 = std::make_shared(reshaped_node_orig, element::f32); + } + auto sub_mean = std::make_shared(reshaped_node2, reduce_mean); + // (x - mean) ^ 2 + auto sqr_const = std::make_shared(element::f32, Shape{1}, std::vector{2}); + auto sqr = 
std::make_shared(sub_mean, sqr_const); + // reduceSum((x - mean) ^ 2) + auto sqr_reduce_sum = std::make_shared(sqr, group_rank - 1); + op::ReduceBase::compute_and_set_reduce_subtensors(sqr_reduce_sum); + // reduceMean((x - mean) ^ 2) + const auto group_size_inv_node_aux = std::make_shared(element::f32, Shape{}, std::vector{group_size_inv}); + auto sqr_mean = std::make_shared(sqr_reduce_sum, group_size_inv_node_aux); + // reduceMean((x - mean) ^ 2) + eps + auto eps_node = std::make_shared(element::f32, Shape{1}, std::vector{eps}); + auto eps_add = std::make_shared(sqr_mean, eps_node); // fma to this add and parent multiply + // variance = sqrt( reducemean( (x - mean) ^ 2 ) + eps ) + auto variance = std::make_shared(eps_add); + + // divide variance + const auto variance_inv = std::make_shared(variance, -1.f); + + // remove invariance in inner loop + std::vector subtensor_invariance(group_rank, 1); + subtensor_invariance[3] = PortDescriptor::ServiceDimensions::FULL_DIM; + PortDescriptorUtils::set_port_descriptor_ptr(reduce_mean->input(0), std::make_shared(reduce_mean->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(reduce_mean->output(0), std::make_shared(reduce_mean->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->input(0), std::make_shared(sqr_mean->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->input(1), std::make_shared(sqr_mean->input(1), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(sqr_mean->output(0), std::make_shared(sqr_mean->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->input(0), std::make_shared(eps_add->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->input(1), std::make_shared(eps_add->input(1), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(eps_add->output(0), std::make_shared(eps_add->output(0), 
subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance->input(0), std::make_shared(variance->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance->output(0), std::make_shared(variance->output(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance_inv->input(0), std::make_shared(variance_inv->input(0), subtensor_invariance)); + PortDescriptorUtils::set_port_descriptor_ptr(variance_inv->output(0), std::make_shared(variance_inv->output(0), subtensor_invariance)); + + auto mvn = std::make_shared(sub_mean, variance_inv); + + // reshape mvn from [N, group, 1, (C / group) * spatial] to [N, group, C / group, spatial] + std::vector group_channel_dims(group_rank); + group_channel_dims[0] = group_dims[0]; + group_channel_dims[1] = group_dims[1]; + group_channel_dims[2] = Dimension(orig_shape[1] / num_groups); + group_channel_dims[3] = spatial_dim; + ov::PartialShape group_channel_shape(group_channel_dims); + const auto mvn_reshaped = std::make_shared(mvn, group_channel_shape); + + // reshape scale and bias to [1, group, C / group, 1] + std::vector scale_bias_dims(group_rank, Dimension(1)); + scale_bias_dims[1] = group_channel_dims[1]; + scale_bias_dims[2] = group_channel_dims[2]; + ov::PartialShape scale_bias_shape(scale_bias_dims); + std::shared_ptr reshape_scale = std::make_shared(scale, scale_bias_shape); + if (scale.get_element_type() != element::f32) { + reshape_scale = std::make_shared(reshape_scale, element::f32); + } + std::shared_ptr reshape_bias = std::make_shared(bias, scale_bias_shape); + if (bias.get_element_type() != element::f32) { + reshape_bias = std::make_shared(reshape_bias, element::f32); + } + + // scaled mvn_reshape[2,5,2,64] reshape_scale[1,5,2,1] -> scaled_node[2,5,2,64] + auto scaled_node = std::make_shared(mvn_reshaped, reshape_scale); + auto biased_node = std::make_shared(scaled_node, reshape_bias); + + auto result_prec = 
group_norm_node->get_output_element_type(0); + std::shared_ptr biased_node_convert = biased_node; + if (result_prec != element::f32) { + biased_node_convert = std::make_shared(biased_node, result_prec); + } + + // reshape_back [N, group, C / group, spatial] to [N, C, spatial] + const auto reshape_back_node = std::make_shared(biased_node_convert, orig_shape); + + return ov::replace_node_update_name(group_norm_node, reshape_back_node); + }; + + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/pass/gn_tokenization.cpp b/src/common/snippets/src/pass/gn_tokenization.cpp new file mode 100644 index 00000000000000..62fe124b2a4f01 --- /dev/null +++ b/src/common/snippets/src/pass/gn_tokenization.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/pass/gn_tokenization.hpp" +#include "snippets/pass/collapse_subgraph.hpp" + +#include "snippets/itt.hpp" +#include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + +ov::snippets::pass::TokenizeGNSnippets::TokenizeGNSnippets() { + MATCHER_SCOPE(TokenizeGNSnippets); + + auto group_norm_pattern = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::pass::TokenizeGNSnippets") + auto group_norm_node = ov::as_type_ptr(m.get_match_root()); + if (group_norm_node->is_dynamic() || group_norm_node->get_element_type() != element::f32) + return false; + + auto subgraph = op::Subgraph::wrap_node_as_subgraph(group_norm_node); + subgraph->get_rt_info()["originalLayersNames"] = group_norm_node->get_friendly_name(); + ov::replace_node(group_norm_node, subgraph); + 
op::update_out_tensor_name(subgraph); + + // mark the Subgraph as Completed to not allow Snippets to include any nodes into the GN Subgraph in common Tokenization + SetSnippetsSubgraphType(subgraph, SnippetsSubgraphType::Completed); + + return true; + }; + auto m = std::make_shared(group_norm_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/tokenization.cpp b/src/common/snippets/src/pass/tokenization.cpp index 4b96a1f60a8977..30cbb42a031f46 100644 --- a/src/common/snippets/src/pass/tokenization.cpp +++ b/src/common/snippets/src/pass/tokenization.cpp @@ -9,6 +9,7 @@ #include "snippets/pass/common_optimizations.hpp" #include "snippets/pass/extract_reshapes_from_mha.hpp" #include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/gn_tokenization.hpp" #include "snippets/pass/collapse_subgraph.hpp" @@ -81,6 +82,7 @@ bool SnippetsTokenization::run_on_model(const std::shared_ptr& m) { manager.register_pass(); manager.register_pass(); manager.register_pass(m_config); + manager.register_pass(); manager.register_pass(); manager.register_pass(m_config); manager.run_passes(m); diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index e8df94bb670d12..371de613305f37 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -245,5 +245,22 @@ Result ReduceShapeInfer::infer(const std::vector& input_shapes) { return {{result_shape}, ShapeInferStatus::success}; } +ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); + target_shape = reshape->get_target_shape(); +} + +Result ReshapeShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed 
in ReshapeShapeInfer"); + OPENVINO_ASSERT(target_shape.is_static(), "target_shape should be static in ReshapeShapeInfer"); + VectorDims result_shape = target_shape.get_shape(); + const auto input_elems = utils::get_shape_size(input_shapes[0].get()); + const auto output_elems = utils::get_shape_size(result_shape); + OPENVINO_ASSERT(input_elems == output_elems, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); + + return {{result_shape}, ShapeInferStatus::success}; +} + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index b6760d87e1afcb..d6c6081113ea1f 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -60,6 +60,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelStatic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), + SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), diff --git a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp index dd5dd631437cd8..e6bafc19ef1700 100644 --- a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp +++ b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp @@ -17,7 +17,8 @@ typedef std::tuple< bool, // Optimized pipeline bool, // With SplitLoops opt size_t, // Expected Buffer size in bytes - size_t // Expected unique Buffer IDs count + size_t // Expected unique Buffer IDs count + // int // buffer output inplace > 
BufferAllocationParams; class BufferAllocationTest : public testing::TestWithParam { @@ -46,6 +47,7 @@ class BufferAllocationTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + pipeline.register_pass(m_buffer_scratchpad, m_buffer_inplace_out, m_is_buffer_optimized); pipeline.run(m_linear_ir); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index cf46840aad8407..3a23ce5de6e655 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -138,6 +138,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::NewMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 75372646a23622..e82d7cdd5b36dc 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -200,13 +200,9 @@ 
jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_g element::Type etype; switch (expr->get_type()) { case snippets::lowered::IOExpression::io_type::INPUT: { - // Note that here we consider only the first child (which is usually load), - // but often there is another child - LoopEnd - auto consumer_inputs = expr->get_output_port_connector(0)->get_consumers(); - const auto& first_consumer = consumer_inputs.begin()->get_expr(); - // If there is a RankNormalization op after a parameter - we should skip it - if (is_type(first_consumer->get_node())) - consumer_inputs = first_consumer->get_output_port_connector(0)->get_consumers(); + // input->shape changing ops->load + auto mem_desc_expr = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(expr, true); + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = ov::as_type_ptr(child_input.get_expr()->get_node()); if (ma && ma->is_memory_access_input_port(child_input.get_index())) { @@ -214,12 +210,15 @@ jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_g break; } } - etype = expr->get_node()->get_output_element_type(0); + etype = mem_desc_expr->get_node()->get_output_element_type(0); + break; break; } case snippets::lowered::IOExpression::io_type::OUTPUT: { - desc = expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); - etype = expr->get_node()->get_input_element_type(0); + // store->shape changing ops->result + auto mem_desc_expr = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(expr, false); + desc = mem_desc_expr->get_input_port_connector(0)->get_source().get_descriptor_ptr(); + etype = mem_desc_expr->get_node()->get_input_element_type(0); break; } default : { OPENVINO_THROW("Kernel detected unsupported io_type"); diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 
15b18de1d9689a..34b56d0cd4e5af 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -168,7 +168,8 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::VectorBuffer) \ OP_EXTENSION(ov::snippets::op::RankNormalization) \ OP_EXTENSION(ov::snippets::op::ReduceMax) \ - OP_EXTENSION(ov::snippets::op::ReduceSum) + OP_EXTENSION(ov::snippets::op::ReduceSum) \ + OP_EXTENSION(ov::snippets::op::Reshape) OPENVINO_CREATE_EXTENSIONS(std::vector( {CPU_EXTENSIONS TYPE_RELAXED_EXTENSIONS SNIPPETS_EXTENSIONS SNIPPETS_DEBUG_CAPS_EXTENSIONS})); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index f7a50ffa14852f..791be15c197a1c 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -505,16 +505,20 @@ void Snippet::SnippetJitExecutor::exec(const std::vector& inMemPtrs, } void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args, - const std::vector& inMemPtrs, const std::vector& outMemPtrs) { + const std::vector& inMemPtrs, const std::vector& outMemPtrs, size_t buffer_offset) { for (size_t i = 0; i < inMemPtrs.size(); i++) call_args.src_ptrs[i] = inMemPtrs[i]->getDataAs() + start_offset_in[i]; for (size_t i = 0; i < outMemPtrs.size(); i++) call_args.dst_ptrs[i] = outMemPtrs[i]->getDataAs() + start_offset_out[i]; - if (buffer_scratchpad_size > 0) { - call_args.buffer_scratchpad_ptr = + if (buffer_inplace_output >= 0) { + call_args.buffer_scratchpad_ptr = call_args.dst_ptrs[buffer_inplace_output] + buffer_offset * dataSize[buffer_inplace_output + numInput]; + } else { + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } } } @@ -547,7 +551,12 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector& inMe [&](int64_t d0, int64_t d1, int64_t d2, 
int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < sizeof(indexes) / sizeof(indexes[0]); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); callable(&call_args, indexes); }); } @@ -558,9 +567,6 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe segfault_detector(); #endif parallel_nt(0, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); - size_t start = 0, end = 0; splitter(harnessWorkAmount, nthr, ithr, start, end); @@ -571,7 +577,13 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe indexes[j] = static_cast(tmp % work_size[j]); tmp /= work_size[j]; } - + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < indexes.size(); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + jit_snippets_call_args call_args; + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); schedule.get_callable()(&call_args, indexes.data()); } }); @@ -595,10 +607,10 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna in_shapes.emplace_back(s); snippetAttrs.snippet->shape_infer(in_shapes); } - const VectorDims& canonicalShape = snippetAttrs.snippet->infer_master_shape(); + master_shape = snippetAttrs.snippet->infer_master_shape(); // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + tensorRank = std::max(static_cast(rank6D), master_shape.size()); auto initDataSizes = [this]() { dataSize.resize(numInput + numOutput); for (size_t i = 0; i < numInput; i++) @@ -608,18 +620,26 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna }; initDataSizes(); - if (snippets::utils::is_dynamic_vdims(canonicalShape)) + if (snippets::utils::is_dynamic_vdims(master_shape)) OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline"); // generate jit_snippets_compile_args jcp; jcp.parallel_executor_ndims = tensorRank; generate(&jcp); - buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size; - buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + buffer_inplace_output = schedule.lowering_result.buffer_inplace_output; + if (buffer_inplace_output == -1) { + buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size; + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + } parallel_exec_domain = schedule.parallel_exec_domain; harnessWorkAmount = std::accumulate(parallel_exec_domain.begin(), parallel_exec_domain.end(), 1, std::multiplies()); parallel_exec_domain = getNormalizedDimsBySize(parallel_exec_domain, tensorRank); + master_shape = getNormalizedDimsBySize(master_shape, tensorRank); + master_shape_stride = std::vector(master_shape.size(), 1); + for (int i = master_shape_stride.size() - 2 ; i >= 0; i--) { + master_shape_stride[i] = master_shape_stride[i + 1] * master_shape[i + 1]; + } } void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9ce3a3b71b760b..89f5221e09f978 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ 
b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -101,7 +101,8 @@ class Snippet : public Node { size_t numOutput = 0; void generate(const jit_snippets_compile_args*); - inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs); + inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs, + size_t buffer_offset); // Evaluates generated snippet using parallel backend void schedule_6d(const std::vector& inMemPtrs, const std::vector& outMemPtrs); void schedule_nt(const std::vector& inMemPtrs, const std::vector& outMemPtrs); @@ -125,6 +126,9 @@ class Snippet : public Node { // Buffer scratchpad std::vector buffer_scratchpad = {}; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; + VectorDims master_shape = {}; + VectorDims master_shape_stride = {}; #ifdef SNIPPETS_DEBUG_CAPS inline void segfault_detector(); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 8dbdd42cee0726..f740b397b54f9f 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -67,6 +67,7 @@ #include "transformations/op_conversions/hswish_decomposition.hpp" #include "transformations/op_conversions/gru_cell_decomposition.hpp" #include "transformations/op_conversions/lstm_cell_decomposition.hpp" +#include "transformations/op_conversions/group_normalization_decomposition.hpp" #include "transformations/op_conversions/mvn6_decomposition.hpp" #include "transformations/op_conversions/normalize_l2_decomposition.hpp" #include "transformations/op_conversions/reduce_l1_decomposition.hpp" @@ -470,6 +471,12 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }, ov::pass::NormalizeL2Decomposition); + CPU_SET_CALLBACK_X64(manager, + [this](const_node_ptr &node) -> bool { + 
return !node->is_dynamic() && node->get_element_type() == element::f32 && inferencePrecision != ov::element::bf16; + }, + ov::pass::GroupNormalizationDecomposition); + CPU_ENABLE_PASS_COMMON(manager, ov::pass::SoftmaxDecomposition); CPU_SET_CALLBACK_COMMON(manager, [](const_node_ptr &node) -> bool { diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp new file mode 100644 index 00000000000000..bd7257318235be --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "single_op_tests/group_normalization.hpp" + +namespace { +using ov::test::GroupNormalizationTest; + +const std::vector netPrecisions = { + ov::element::f32, +}; + +// static shapes +const std::vector staticInputShapes = { + {3, 8, 3}, + {3, 8, 8}, + {3, 8, 16}, + {3, 8, 21}, + {1, 4, 8, 8}, + {1, 8, 1, 22}, + {3, 16, 1, 33}, + {1, 4, 1, 1, 34}, + {1, 8, 1, 8, 2, 2}, + {1, 8, 1, 8, 2, 2, 2}, +}; + +// dynmaic shapes +const std::vector DynamicInputShapes = { + {{-1, -1, -1}, {{1, 8, 22}, {2, 4, 7}, {1, 8, 22}}}, + {{-1, -1, -1, -1}, {{1, 16, 8, 8}, {2, 8, 4, 4}, {1, 16, 8, 8}}}, + {{{1, 4}, {4, 16}, -1, -1}, {{1, 4, 6, 6}, {4, 16, 10, 10}, {1, 4, 6, 6}}}, + {{-1, -1, -1, -1, -1}, {{1, 16, 7, 7, 1}, {2, 8, 4, 4, 1}, {1, 16, 7, 7, 1}}}, +}; + +const std::vector numGroups = { + 2, 4, +}; + +const std::vector epsilon = { + 0.0001 +}; + +std::vector additionalConfig = { + {{ov::hint::inference_precision(ov::element::f32)}}, + {{ov::hint::inference_precision(ov::element::bf16)}} +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalizationStatic, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + 
::testing::Values(ov::element::undefined), + testing::ValuesIn(ov::test::static_shapes_to_test_representation(staticInputShapes)), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_CPU), + testing::ValuesIn(additionalConfig)), + GroupNormalizationTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalizationDyn, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + testing::ValuesIn(DynamicInputShapes), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_CPU), + testing::ValuesIn(additionalConfig)), + GroupNormalizationTest::getTestCaseName); + +} // anonymous namespace \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 8280bdfe251783..cbdee358936444 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -292,6 +292,8 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*Extension.OnnxModelWithExtensionFromDSO.*)"); retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.MaxPool.*)"); retVector.emplace_back(R"(.*ONNXQuantizedModels/QuantizedModelsTests.Convolution.*)"); + // Ticket: 134601 + retVector.emplace_back(R"(.*smoke_GroupNormalization.*)"); } // invalid test: checks u8 precision for runtime graph, while it should be f32 retVector.emplace_back(R"(smoke_NegativeQuantizedMatMulMultiplyFusion.*)"); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp index 
b66ce1919f6d23..07c84885796093 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp @@ -88,7 +88,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + int inplace = -1; + pipeline.register_pass(m_buffer_scratchpad, inplace, m_is_buffer_optimized); pipeline.run(m_linear_ir); } diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp index 612c53db90ab39..606ee8ede9e972 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/group_normalization.hpp @@ -27,8 +27,8 @@ class GroupNormalizationTest : public testing::WithParamInterfaceGetParam(); + std::tie(ngPrc, inType, outType, shapes, num_groups, epsilon, targetDevice, additional_config) = this->GetParam(); InputShape biasInputShape = ExtractBiasShape(shapes); init_input_shapes({shapes, biasInputShape, biasInputShape}); ov::ParameterVector params; @@ -73,6 +78,8 @@ class GroupNormalizationTest : public testing::WithParamInterface(results, params, "GroupNormalization"); }