Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

intermediate_buffer_output_share_memory #3

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Generator;
* @brief Holds all relevant information produced during lowering
* @param compiled_snippet pointer to interface class that encapsulates compiled binary code
* @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute.
* @param buffer_inplace_output index of the subgraph output whose memory the buffer shares: -1 means no sharing; i >= 0 means the buffer shares the memory of the i-th output.
* Must be allocated and freed by the backend.
*/
class LoweringResult {
Expand All @@ -35,6 +36,7 @@ class LoweringResult {
public:
std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;
size_t buffer_scratchpad_size = 0;
int buffer_inplace_output = -1;
};

/**
Expand Down
16 changes: 16 additions & 0 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,22 @@ class LinearIR {
*/
exprIt replace_with_expr(const std::vector<ExpressionPtr>& old_exprs, const ExpressionPtr& new_expr);

/**
* @brief Propagate start_expr through zero or more consecutive shape-infer exprs (such as Reshape, RankNormalization).
* @param start_expr Propagate from start_expr.
* @param downstream Propagate downstream if it's true, otherwise propagate upstream.
* @return shape infer op consumers as a sequence if downstream, or shape infer op sources as a sequence if upstream.
*/
static std::vector<ExpressionPtr> propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream);

/**
* @brief Get the last shape-infer op reachable from start_expr in a sequence. If no shape-infer op is connected to start_expr, return start_expr.
* @param start_expr Search from start_expr.
* @param downstream search downstream if it's true, otherwise search upstream.
* @return last shape infer expr
*/
static ExpressionPtr get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream);

private:
std::shared_ptr<ShapeInferSnippetsNode> m_shape_infer = nullptr;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace pass {
class AllocateBuffers: public RangedPass {
public:
OPENVINO_RTTI("AllocateBuffers", "RangedPass")
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true);

/**
* @brief Apply the pass to the Linear IR
Expand All @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass {

using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;

private:
size_t& m_buffer_scratchpad_size;
int& m_buffer_inplace_output;
bool m_is_optimized_mode = true;
};

Expand Down
38 changes: 38 additions & 0 deletions src/common/snippets/include/snippets/op/reshape.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
 * @interface Reshape
 * @brief Reshape input tensor to required target shape
 * @ingroup snippets
 */
class Reshape : public ov::op::Op {
public:
    OPENVINO_OP("Reshape", "SnippetsOpset");
    // Constructs a Reshape of input `x` to the given `target_shape`.
    Reshape(const Output<Node>& x, ov::PartialShape target_shape);
    Reshape() = default;

    bool visit_attributes(AttributeVisitor& visitor) override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
    void validate_and_infer_types() override;

    // Accessors for the shape this op reshapes its input to.
    const ov::PartialShape& get_target_shape() const;
    void set_target_shape(ov::PartialShape shape);

private:
    // Shape reported on the output; empty until set via ctor/set_target_shape.
    ov::PartialShape m_target_shape = {};
};

} // namespace op
} // namespace snippets
} // namespace ov
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/op/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class Subgraph : public ov::op::util::SubGraphOp {
// Return estimated unique buffer count (upper bound). It's needed for tokenization
static auto get_estimated_buffer_count(const ov::NodeVector& ops) -> size_t;
static auto is_domain_sensitive_op(const std::shared_ptr<ov::Node>& op) -> bool;
static auto is_shape_infer_op(const std::shared_ptr<ov::Node>& op) -> bool;
static auto get_last_shape_infer_op(const std::shared_ptr<ov::Node>& op, bool downstream) -> std::shared_ptr<ov::Node>;

void data_flow_transformations(const BlockedShapeVector& blocked_input_shapes = {},
const std::vector<ov::element::Type>& input_precisions = {},
Expand Down
27 changes: 27 additions & 0 deletions src/common/snippets/include/snippets/pass/gn_decomposition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/graph_rewrite.hpp"
#include "openvino/pass/pattern/matcher.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
 * @interface GNDecomposition
 * @brief Decomposes GroupNormalization to a range of low-level operations
 * @ingroup snippets
 */
class GNDecomposition: public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("GNDecomposition", "0");
    // Registers the matcher and rewrite callback (defined in the .cpp).
    GNDecomposition();
};

} // namespace pass
} // namespace snippets
} // namespace ov
27 changes: 27 additions & 0 deletions src/common/snippets/include/snippets/pass/gn_tokenization.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/pattern/matcher.hpp"
#include "snippets/pass/tokenization.hpp"

namespace ov {
namespace snippets {
namespace pass {

/**
 * @interface TokenizeGNSnippets
 * @brief Tokenize GroupNormalization to a subgraph
 * @ingroup snippets
 */
class TokenizeGNSnippets : public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("TokenizeGNSnippets", "0");
    // Registers the matcher and tokenization callback (defined in the .cpp).
    TokenizeGNSnippets();
};

} // namespace pass
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,12 @@ class ReduceShapeInfer : public IShapeInferSnippets {
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

// Shape inference for snippets::op::Reshape; mirrors the sibling *ShapeInfer
// helpers above. The inference logic itself lives in the matching .cpp.
class ReshapeShapeInfer : public IShapeInferSnippets {
    // NOTE(review): presumably the Reshape node's target shape captured from `n`
    // at construction — confirm against the .cpp implementation.
    ov::PartialShape target_shape;
public:
    explicit ReshapeShapeInfer(const std::shared_ptr<Node>& n);
    Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

} // namespace snippets
} // namespace ov
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "op/fill.hpp"
#include "op/kernel.hpp"
#include "op/load.hpp"
#include "op/reshape.hpp"
#include "op/nop.hpp"
#include "op/scalar.hpp"
#include "op/powerstatic.hpp"
Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
OV_OP(BroadcastLoad, ov::snippets::op)
OV_OP(Reshape, ov::snippets::op)

OV_OP(Store, ov::snippets::op)

Expand Down
8 changes: 8 additions & 0 deletions src/common/snippets/include/snippets/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ VectorDims get_planar_vdims(const snippets::lowered::ExpressionPort& expr_port);
* @return preordered shape: `shape[i]` = `planar_shape[order[i]]` where `shape` is shape before applying the order.
*/
VectorDims get_preordered_vdims(const snippets::lowered::ExpressionPort& expr_port);
/**
* @brief Returns element count of a shape
* @param shape input shape
* @return element count of input shape
*/
inline auto get_shape_size(const VectorDims& shape) -> size_t {
return std::accumulate(shape.begin(), shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
}
/* --------------------------- */

} // namespace utils
Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output<Node>& out) const {
std::dynamic_pointer_cast<op::IntermediateMemoryBuffer>(op) ||
std::dynamic_pointer_cast<op::NewMemoryBuffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::Reshape>(op) ||
std::dynamic_pointer_cast<snippets::op::Store>(op)
#ifdef SNIPPETS_DEBUG_CAPS
|| std::dynamic_pointer_cast<op::PerfCountBeginBase>(op)
Expand Down
81 changes: 77 additions & 4 deletions src/common/snippets/src/lowered/linear_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "openvino/core/graph_util.hpp"
#include "openvino/core/type.hpp"
#include "snippets/utils.hpp"
#include "snippets/op/subgraph.hpp"

namespace ov {
namespace snippets {
Expand Down Expand Up @@ -365,10 +366,14 @@ VectorDims LinearIR::get_master_shape() const {
}
// Note: Snippets would benefit from a more generic master_shape calculation approach.
// It will be implemented in the scope of ROI propagation activity (ticket 120505)
const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source();
if (!m_config.m_enable_domain_optimization && out_exprs.size() == 1 &&
ov::is_type<snippets::op::Brgemm>(source.get_expr()->get_node())) {
master_shape = utils::get_preordered_vdims(source);
if (out_exprs.size() == 1) {
const auto& source = out_exprs[0]->get_input_port_connector(0)->get_source();
if (!m_config.m_enable_domain_optimization && ov::is_type<snippets::op::Brgemm>(source.get_expr()->get_node())) {
master_shape = utils::get_preordered_vdims(source);
} else {
auto last_shape_infer_expr = LinearIR::get_last_shape_infer_expr(out_exprs[0], false);
master_shape = utils::get_preordered_vdims(last_shape_infer_expr->get_input_port_connector(0)->get_source());
}
} else {
for (const auto& oe : out_exprs) {
const auto& port_desc = oe->get_input_port_descriptor(0);
Expand Down Expand Up @@ -493,6 +498,74 @@ LinearIR::exprIt LinearIR::replace_with_expr(const std::vector<ExpressionPtr>& o
return replace_with_expr(old_exprs, new_expr, insertion_place);
}

std::vector<ExpressionPtr> LinearIR::propagate_expr_through_shape_infer_ops(const ExpressionPtr& start_expr, bool downstream) {
    // Collects the chain of consecutive shape-infer expressions reachable from
    // `start_expr` in the requested direction. `start_expr` itself is included
    // when it is a shape-infer op. The result is ordered by distance from
    // `start_expr` (nearest first), for both traversal directions.
    std::vector<ExpressionPtr> shape_infer_exprs;
    auto current_exp = start_expr;
    if (op::Subgraph::is_shape_infer_op(current_exp->get_node())) {
        shape_infer_exprs.push_back(current_exp);
    }
    if (downstream) {
        // No outputs => nothing further to follow.
        if (current_exp->get_output_count() == 0)
            return shape_infer_exprs;
        // Only output port 0 is walked: the chain is assumed to be single-port.
        auto consumers = current_exp->get_output_port_connector(0)->get_consumers();
        auto first_child = consumers.begin()->get_expr();
        while (op::Subgraph::is_shape_infer_op(first_child->get_node())) {
            // A shape-infer consumer must be the sole consumer, otherwise the
            // chain would fork and the propagation would be ambiguous.
            OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer.");
            shape_infer_exprs.push_back(first_child);
            current_exp = first_child;
            if (current_exp->get_output_count() == 0)
                break;
            consumers = current_exp->get_output_port_connector(0)->get_consumers();
            first_child = consumers.begin()->get_expr();
        }
        return shape_infer_exprs;
    } else {
        // upstream: walk producers of input port 0 while they are shape-infer ops.
        if (current_exp->get_input_count() == 0)
            return shape_infer_exprs;
        auto first_source = current_exp->get_input_port_connector(0)->get_source().get_expr();
        while (op::Subgraph::is_shape_infer_op(first_source->get_node())) {
            shape_infer_exprs.push_back(first_source);
            current_exp = first_source;
            if (current_exp->get_input_count() == 0)
                break;
            first_source = current_exp->get_input_port_connector(0)->get_source().get_expr();
        }
        return shape_infer_exprs;
    }
}

ExpressionPtr LinearIR::get_last_shape_infer_expr(const ExpressionPtr& start_expr, bool downstream) {
    // Walks the chain of shape-infer expressions attached to `start_expr` and
    // returns the final one; returns `start_expr` itself when no shape-infer op
    // is connected in the requested direction.
    auto result = start_expr;
    while (true) {
        if (downstream) {
            // Follow the first consumer of output port 0 while it is a shape-infer op.
            if (result->get_output_count() == 0)
                return result;
            const auto consumers = result->get_output_port_connector(0)->get_consumers();
            const auto candidate = consumers.begin()->get_expr();
            if (!op::Subgraph::is_shape_infer_op(candidate->get_node()))
                return result;
            // A shape-infer consumer must be the sole consumer of the port.
            OPENVINO_ASSERT(consumers.size() == 1, "Shape infer ops are supposed to be the only consumer.");
            result = candidate;
        } else {
            // upstream: follow the producer of input port 0 while it is a shape-infer op.
            if (result->get_input_count() == 0)
                return result;
            const auto candidate = result->get_input_port_connector(0)->get_source().get_expr();
            if (!op::Subgraph::is_shape_infer_op(candidate->get_node()))
                return result;
            result = candidate;
        }
    }
}

LinearIR::LIRShapeInfer::LIRShapeInfer(container& body_exprs, io_container& io_exprs)
: ShapeInferSnippetsNode(),
m_exprs{std::make_shared<container>(body_exprs)} {
Expand Down
10 changes: 6 additions & 4 deletions src/common/snippets/src/lowered/pass/allocate_buffers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ namespace snippets {
namespace lowered {
namespace pass {

AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
// Binds the externally owned result slots the pass fills in during run():
// `buffer_scratchpad_size` (extra memory the kernel needs) and
// `buffer_inplace_output` (output index whose memory a buffer shares, -1 = none).
// `is_optimized` selects the optimized buffer-allocation mode.
// NOTE: initializers are listed in member declaration order
// (m_buffer_scratchpad_size, m_buffer_inplace_output, m_is_optimized_mode) so the
// list matches the actual initialization order and avoids a -Wreorder warning.
AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized)
    : m_buffer_scratchpad_size(buffer_scratchpad_size),
      m_buffer_inplace_output(buffer_inplace_output),
      m_is_optimized_mode(is_optimized) {}

void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
// If Buffer has offset We set this offset in the connected MemoryAccess ops
Expand All @@ -46,7 +46,8 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const
}
}
// Propagate to down: in Load. Buffer can have several Load
const auto& buffer_out = buffer_expr->get_output_port_connector(0);
auto last_shape_infer = ov::snippets::lowered::LinearIR::get_last_shape_infer_expr(buffer_expr, true);
const auto& buffer_out = last_shape_infer->get_output_port_connector(0);
for (const auto& child_expr_input : buffer_out->get_consumers()) {
const auto& child_expr = child_expr_input.get_expr();
const auto port = child_expr_input.get_index();
Expand All @@ -59,7 +60,7 @@ void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const
continue;
} else {
OPENVINO_THROW(
"Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation");
"Buffer::set_offset() was called when Buffer didn't have the corresponding MemoryAccess op for offset propagation");
}
}
}
Expand All @@ -77,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const
pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
pipeline.register_pass<NormalizeBufferIDs>();
pipeline.run(linear_ir);
m_buffer_inplace_output = 0;
} else {
InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
}
Expand Down
Loading
Loading