[Snippets][CPU] Added external repacking via BrgemmCopyB (openvinotoolkit#28179)

### Details:
- *Added two separate implementations of external repacking: one performed in the same parallel section as the kernel, and one performed in a separate parallel section before kernel execution (see the sketch below).*
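
A minimal, hypothetical sketch of the two placements; `parallel_for`, `repack_b_block`, and `brgemm_block` are illustrative stand-ins, not the actual OpenVINO code:

```cpp
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Toy stand-in for a parallel runtime (OpenVINO uses its own threading layer).
void parallel_for(size_t n, const std::function<void(size_t)>& body) {
    std::vector<std::thread> pool;
    pool.reserve(n);
    for (size_t i = 0; i < n; ++i)
        pool.emplace_back(body, i);
    for (auto& t : pool)
        t.join();
}

// Hypothetical per-block work items.
void repack_b_block(size_t /*block*/) { /* BrgemmCopyB-style repacking of a B block */ }
void brgemm_block(size_t /*block*/)   { /* brgemm kernel over the repacked block */ }

// Placement 1: repacking runs in the same parallel section as the kernel;
// each thread repacks the B block it is about to consume.
void run_fused(size_t num_blocks) {
    parallel_for(num_blocks, [](size_t block) {
        repack_b_block(block);
        brgemm_block(block);
    });
}

// Placement 2: a separate parallel section repacks all of B up front,
// then the kernel runs over already-repacked data.
void run_split(size_t num_blocks) {
    parallel_for(num_blocks, [](size_t block) { repack_b_block(block); });
    parallel_for(num_blocks, [](size_t block) { brgemm_block(block); });
}

int main() {
    run_fused(4);
    run_split(4);
}
```

Which placement wins depends on how much repacked data is reused across kernel iterations versus how much synchronization the extra parallel section costs, hence the choice heuristic mentioned in the TODO list.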

### Tickets:
 - *159886*

### TODO:
- [x] *Adjust the heuristic for choosing the implementation*
- [x] *Add layout support*
- [x] *[Cherry-picked] Merge a-sidorova#266 into this branch*
- [x] *[Cherry-picked] Merge a-sidorova#267 into this branch*

---------

Co-authored-by: Vladislav Golubev <[email protected]>
2 people authored and MirceaDan99 committed Jan 22, 2025
1 parent 2151da8 commit c421a52
Showing 64 changed files with 1,734 additions and 880 deletions.
2 changes: 1 addition & 1 deletion src/common/snippets/docs/mha_optimization_guide.md
@@ -65,7 +65,7 @@ The supported by decomposition Transpose orders are defined by `TokenizeMHASnipp

[SplitDimensionM](../src/pass/split_dimension_m.cpp) splits M dimension of MHA in 2 parts (`batch_m` and `new_m`) by inserting Reshape on A input of the first Matmul and output of the second Matmul (the rest Subgraph's inputs are reshaped by Unsqueeze-like reshapes in order not to break subgraph semantic).
This optimization increases parallel work amount by `batch_m` times thus enabling a more efficient parallel execution in some cases.
-The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::get_splited_dimensions` method.
+The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::split` method.

Let's consider an example of the transformation:

10 changes: 5 additions & 5 deletions src/common/snippets/include/snippets/op/load.hpp
@@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op {
};

/**
- * @interface LoadReshape
+ * @interface LoadReorder
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
* @ingroup snippets
*/
-class LoadReshape : public Load {
+class LoadReorder : public Load {
public:
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReshape() = default;
OPENVINO_OP("LoadReorder", "SnippetsOpset", Load);
LoadReorder(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReorder() = default;

void set_offset(size_t offset) { set_output_offset(offset, 0); }
void set_count(size_t count) { set_output_count(count, 0); }
src/common/snippets/include/snippets/op/rank_normalization.hpp
@@ -4,7 +4,7 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
@@ -21,9 +21,9 @@ namespace op {
// Note that technically the same goal could be achieved using op::Unsqueeze operation,
// but RankNormalization has a much narrower semantics, and hence allows for an easier control and a more efficient shape infer.
//
-class RankNormalization : public ov::op::Op {
+class RankNormalization : public ShapeInferOp {
public:
OPENVINO_OP("RankNormalization", "SnippetsOpset");
OPENVINO_OP("RankNormalization", "SnippetsOpset", ShapeInferOp);

RankNormalization() = default;
RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append);
43 changes: 43 additions & 0 deletions src/common/snippets/include/snippets/op/reorder.hpp
@@ -0,0 +1,43 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
namespace op {
/**
* @interface Reorder
 * @brief Reorder reshapes input tensor shape by required target order.
* The tensor data is not updated.
* Note: Order is stored in input PortDescriptor
* @ingroup snippets
*/
class Reorder : public ShapeInferOp {
public:
OPENVINO_OP("Reorder", "SnippetsOpset", ShapeInferOp);
Reorder() = default;
Reorder(const Output<Node>& x, std::vector<size_t> order);

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;

class ShapeInfer : public IShapeInferSnippets {
std::vector<size_t> m_target_order {};
public:
explicit ShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

private:
void custom_constructor_validate_and_infer_types(std::vector<size_t> order);
};

} // namespace op
} // namespace snippets
} // namespace ov
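
For intuition, here is a stand-alone sketch of this order-based shape inference (a hypothetical helper, not the actual `Reorder::ShapeInfer::infer` / `utils::get_planar_vdims` code; `LoadReorder` propagates shapes the same way):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

using VectorDims = std::vector<size_t>;

// Reorder-style shape inference: permute the dims by the target order.
// Only the shape descriptor changes; no tensor data is moved.
VectorDims reorder_shape(const VectorDims& shape, const std::vector<size_t>& order) {
    assert(shape.size() == order.size());
    VectorDims out(shape.size());
    for (size_t i = 0; i < order.size(); ++i)
        out[i] = shape[order[i]];
    return out;
}

int main() {
    // A [2, 3, 4] shape with target order {0, 2, 1} becomes [2, 4, 3].
    assert((reorder_shape({2, 3, 4}, {0, 2, 1}) == VectorDims{2, 4, 3}));
}
```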
15 changes: 12 additions & 3 deletions src/common/snippets/include/snippets/op/reshape.hpp
@@ -4,7 +4,8 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
@@ -15,9 +16,9 @@ namespace op {
 * @brief Reshape input tensor to required target shape
* @ingroup snippets
*/
-class Reshape : public ov::op::Op {
+class Reshape : public ShapeInferOp {
 public:
-OPENVINO_OP("Reshape", "SnippetsOpset");
+OPENVINO_OP("Reshape", "SnippetsOpset", ShapeInferOp);
Reshape(const Output<Node>& x, ov::PartialShape target_shape);
Reshape() = default;

@@ -28,6 +29,14 @@ class Reshape : public ov::op::Op {
const ov::PartialShape& get_target_shape() const;
void set_target_shape(ov::PartialShape shape);

+class ShapeInfer : public IShapeInferSnippets {
+    VectorDims target_shape;
+    size_t target_shape_volume = 0;
+public:
+    explicit ShapeInfer(const std::shared_ptr<Node>& n);
+    Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
+};

private:
ov::PartialShape m_target_shape = {};
};
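
The cached `target_shape_volume` lets `infer` cheaply validate that the input element count matches the fixed target shape. A rough sketch of that idea (hypothetical code, not the actual implementation):

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using VectorDims = std::vector<size_t>;

size_t volume(const VectorDims& dims) {
    return std::accumulate(dims.begin(), dims.end(), size_t{1}, std::multiplies<size_t>{});
}

// The output shape is fixed at construction time, so inference reduces to
// an element-count check plus returning the precomputed target shape.
struct ReshapeInferSketch {
    VectorDims target_shape;
    size_t target_shape_volume;

    explicit ReshapeInferSketch(VectorDims shape)
        : target_shape(std::move(shape)), target_shape_volume(volume(target_shape)) {}

    VectorDims infer(const VectorDims& input_shape) const {
        assert(volume(input_shape) == target_shape_volume && "Reshape must preserve the element count");
        return target_shape;
    }
};

int main() {
    ReshapeInferSketch reshape({2, 12});
    assert((reshape.infer({2, 3, 4}) == VectorDims{2, 12}));  // 24 elements either way
}
```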
27 changes: 27 additions & 0 deletions src/common/snippets/include/snippets/op/shape_infer_op.hpp
@@ -0,0 +1,27 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
* @interface ShapeInferOp
* @brief Op which infers shape without actually moving data
* @ingroup snippets
*/
class ShapeInferOp : public ov::op::Op {
public:
OPENVINO_OP("ShapeInferOp", "SnippetsOpset");
ShapeInferOp() = default;
ShapeInferOp(const OutputVector& args) : ov::op::Op(args) {}
};

} // namespace op
} // namespace snippets
} // namespace ov
15 changes: 14 additions & 1 deletion src/common/snippets/include/snippets/pass/split_dimension_m.hpp
@@ -67,11 +67,24 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass {

private:
static std::shared_ptr<ov::op::v0::MatMul> get_matmul(const std::shared_ptr<op::Subgraph>& subgraph);
-static std::pair<size_t, size_t> get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Contains splitM approaches allowing to get the batch ideally divisible by optimal_parallelism_work_amount
+ */
+static std::pair<size_t, size_t> split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Splits m_dim to minimize kernel_m in order to reduce waiting time for idle threads at the last parallel loop iteration.
+ */
+static std::pair<size_t, size_t> split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Splits m_dim to get the batch in (optimal_parallelism_work_amount, 2 * optimal_parallelism_work_amount) interval
+ */
+static std::pair<size_t, size_t> split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);

void reshape_subgraph(const std::shared_ptr<op::Subgraph>& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim);

size_t m_concurrency;

+static const size_t min_kernel_m;
};
} // namespace pass
} // namespace snippets
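
As a worked example: with `batch_dim = 2`, `m_dim = 384`, and 8 threads, splitting M as `384 = 8 * 48` gives `2 * 8 = 16` parallel units, evenly divisible by the thread count. A hypothetical sketch of an "ideal split" search in the spirit of `split_ideally` (not the actual heuristic):

```cpp
#include <cassert>
#include <cstddef>
#include <utility>

// Find batch_m such that batch_m divides m_dim and the resulting parallel
// work amount (batch_dim * batch_m) is divisible by the thread count.
// Returns {batch_m, new_m}, or {1, m_dim} when no such split exists.
std::pair<size_t, size_t> split_ideally_sketch(size_t batch_dim, size_t m_dim, size_t n_threads) {
    for (size_t batch_m = n_threads; batch_m >= 2; --batch_m)
        if (m_dim % batch_m == 0 && (batch_dim * batch_m) % n_threads == 0)
            return {batch_m, m_dim / batch_m};
    return {1, m_dim};
}

int main() {
    const auto split = split_ideally_sketch(2, 384, 8);
    assert(split.first == 8 && split.second == 48);
}
```

The other two declared strategies cover the cases such a search handles poorly: `split_minimize_kernel_wa` trades divisibility for a smaller kernel-side M (bounded below by `min_kernel_m`) so idle threads wait less on the last loop iteration, and `split_fallback_increase_parallel_wa` merely pushes the batch into the `(work_amount, 2 * work_amount)` interval as a last resort.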
src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp
@@ -75,12 +75,5 @@ class ReduceShapeInfer : public IShapeInferSnippets {
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

-class ReshapeShapeInfer : public IShapeInferSnippets {
-    VectorDims target_shape;
-    size_t target_shape_volume = 0;
-public:
-    explicit ReshapeShapeInfer(const std::shared_ptr<Node>& n);
-    Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
-};
} // namespace snippets
} // namespace ov
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
@@ -18,6 +18,7 @@
#include "op/kernel.hpp"
#include "op/load.hpp"
#include "op/reshape.hpp"
#include "op/reorder.hpp"
#include "op/nop.hpp"
#include "op/scalar.hpp"
#include "op/powerstatic.hpp"
3 changes: 2 additions & 1 deletion src/common/snippets/include/snippets/snippets_isa_tbl.hpp
@@ -11,12 +11,13 @@

// SnippetS dialect
OV_OP(Load, ov::snippets::op)
-OV_OP(LoadReshape, ov::snippets::op)
+OV_OP(LoadReorder, ov::snippets::op)
OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
OV_OP(BroadcastLoad, ov::snippets::op)
OV_OP(Reshape, ov::snippets::op)
+OV_OP(Reorder, ov::snippets::op)

OV_OP(Store, ov::snippets::op)

15 changes: 14 additions & 1 deletion src/common/snippets/include/snippets/utils/utils.hpp
@@ -290,13 +290,26 @@ std::shared_ptr<ov::Node> get_leaf_node_of_first_child_shape_infer_seq(const std
std::shared_ptr<ov::Node> get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr<ov::Node>& start_node);

/**
 * @brief Get stride of input/output dimension
 * @param expr_port target port that contains shape and layout info
 * @param idx index of the target dimension starting from the shape's end (default = 1)
 */
int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1);
/**
* @brief Get stride of input dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);
/**
* @brief Get stride of output dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);

/**
* @brief Traverses path starting from "expr", and calls "func" for each expression.
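
A worked illustration of the stride semantics (a hypothetical reimplementation; it assumes `layout` lists dimension indices from outermost to innermost in memory and ignores the input/output port distinction the real helpers handle): the stride of a dimension is the product of the extents of all dimensions stored after it.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

using VectorDims = std::vector<size_t>;

// Stride of the dimension at position `pos` of `layout` (illustrative only).
int64_t stride_in_memory_order(const VectorDims& shape, const VectorDims& layout, size_t pos) {
    assert(shape.size() == layout.size() && pos < layout.size());
    int64_t stride = 1;
    for (size_t i = pos + 1; i < layout.size(); ++i)
        stride *= static_cast<int64_t>(shape[layout[i]]);
    return stride;
}

int main() {
    const VectorDims shape{2, 3, 4};
    // Planar layout {0, 1, 2}: strides are 12, 4, 1.
    assert(stride_in_memory_order(shape, {0, 1, 2}, 0) == 12);
    // Transposed layout {0, 2, 1}: dimension 2 (extent 4) now precedes
    // dimension 1 in memory, so its stride is shape[1] = 3.
    assert(stride_in_memory_order(shape, {0, 2, 1}, 1) == 3);
}
```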
1 change: 1 addition & 0 deletions src/common/snippets/src/generator.cpp
@@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output<Node>& out) const {
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::Reshape>(op) ||
+std::dynamic_pointer_cast<op::Reorder>(op) ||
std::dynamic_pointer_cast<snippets::op::Store>(op)
#ifdef SNIPPETS_DEBUG_CAPS
|| std::dynamic_pointer_cast<op::PerfCountBeginBase>(op)
src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp
@@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) {
const auto& node = expr->get_node();
return ov::is_type<ov::snippets::op::Brgemm>(node) ||
ov::is_type<ov::snippets::op::Reshape>(node) ||
-ov::is_type<ov::snippets::op::LoadReshape>(node);
+ov::is_type<ov::snippets::op::LoadReorder>(node);
}
} // namespace

28 changes: 14 additions & 14 deletions src/common/snippets/src/op/load.cpp
@@ -41,19 +41,19 @@ std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args)
return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
}

-LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
+LoadReorder::LoadReorder(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
: Load(x, count, offset), m_order(std::move(order)) {
const auto& in_shape = x.get_partial_shape();
const auto in_shape_size = in_shape.size();
-OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
+OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size");
OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
-*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
+*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order");
const std::set<size_t> unique_dims(order.begin(), order.end());
-OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
+OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements");
constructor_validate_and_infer_types();
}

-void LoadReshape::validate_and_infer_types() {
+void LoadReorder::validate_and_infer_types() {
validate_memory_access_params();
const auto& old_shape = get_input_partial_shape(0);
ov::PartialShape new_shape;
@@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), new_shape);
}

-bool LoadReshape::visit_attributes(AttributeVisitor& visitor) {
+bool LoadReorder::visit_attributes(AttributeVisitor& visitor) {
MemoryAccess::visit_attributes(visitor);
visitor.on_attribute("order", m_order);
return true;
}

-std::shared_ptr<Node> LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
-INTERNAL_OP_SCOPE(LoadReshape);
+std::shared_ptr<Node> LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const {
+INTERNAL_OP_SCOPE(LoadReorder);
check_new_args_count(this, new_args);
-return std::make_shared<LoadReshape>(new_args.at(0), get_count(), get_offset(), m_order);
+return std::make_shared<LoadReorder>(new_args.at(0), get_count(), get_offset(), m_order);
}
-LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
-const auto& loadReshape = ov::as_type_ptr<LoadReshape>(n);
-OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer");
-m_order = loadReshape->m_order;
+LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
+const auto& loadReorder = ov::as_type_ptr<LoadReorder>(n);
+OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer");
+m_order = loadReorder->m_order;
}
-IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
+IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes");
return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success};
}
3 changes: 1 addition & 2 deletions src/common/snippets/src/op/rank_normalization.cpp
@@ -10,11 +10,10 @@ namespace snippets {
namespace op {

RankNormalization::RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append) :
-Op({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
+ShapeInferOp({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
constructor_validate_and_infer_types();
}


std::shared_ptr<ov::Node> RankNormalization::clone_with_new_inputs(const OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<RankNormalization>(new_args[0], m_num_prepend, m_num_append);