[Snippets][CPU] Added external repacking via BrgemmCopyB (openvinotoolkit#28179)

### Details:
- *Added two separate implementations of external repacking: one performed in the same parallel section as the kernel, and one performed in a separate parallel section before kernel execution (see the sketch below).*
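
A minimal, hypothetical sketch of the two placements; `parallel_for`, `repack_b_block`, and `brgemm_block` are illustrative stand-ins, not the actual OpenVINO code:

```cpp
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Toy stand-in for a parallel runtime (OpenVINO uses its own threading layer).
void parallel_for(size_t n, const std::function<void(size_t)>& body) {
    std::vector<std::thread> pool;
    pool.reserve(n);
    for (size_t i = 0; i < n; ++i)
        pool.emplace_back(body, i);
    for (auto& t : pool)
        t.join();
}

// Hypothetical per-block work items.
void repack_b_block(size_t /*block*/) { /* BrgemmCopyB-style repacking of a B block */ }
void brgemm_block(size_t /*block*/)   { /* brgemm kernel over the repacked block */ }

// Placement 1: repacking runs in the same parallel section as the kernel;
// each thread repacks the B block it is about to consume.
void run_fused(size_t num_blocks) {
    parallel_for(num_blocks, [](size_t block) {
        repack_b_block(block);
        brgemm_block(block);
    });
}

// Placement 2: a separate parallel section repacks all of B up front,
// then the kernel runs over already-repacked data.
void run_split(size_t num_blocks) {
    parallel_for(num_blocks, [](size_t block) { repack_b_block(block); });
    parallel_for(num_blocks, [](size_t block) { brgemm_block(block); });
}

int main() {
    run_fused(4);
    run_split(4);
}
```

Which placement wins depends on how much repacked data is reused across kernel iterations versus how much synchronization the extra parallel section costs, hence the choice heuristic mentioned in the TODO list.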

### Tickets:
 - *159886*

### TODO:
- [x] *Adjust the heuristic for choosing the implementation*
- [x] *Add layout support*
- [x] *[Cherry-picked] Merge a-sidorova#266 into this branch*
- [x] *[Cherry-picked] Merge a-sidorova#267 into this branch*

---------

Co-authored-by: Vladislav Golubev <[email protected]>
2 people authored and MirceaDan99 committed Jan 22, 2025
1 parent 2151da8 commit c421a52
Showing 64 changed files with 1,734 additions and 880 deletions.
2 changes: 1 addition & 1 deletion src/common/snippets/docs/mha_optimization_guide.md
@@ -65,7 +65,7 @@ The supported by decomposition Transpose orders are defined by `TokenizeMHASnipp

[SplitDimensionM](../src/pass/split_dimension_m.cpp) splits M dimension of MHA in 2 parts (`batch_m` and `new_m`) by inserting Reshape on A input of the first Matmul and output of the second Matmul (the rest Subgraph's inputs are reshaped by Unsqueeze-like reshapes in order not to break subgraph semantic).
This optimization increases parallel work amount by `batch_m` times thus enabling a more efficient parallel execution in some cases.
-The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::get_splited_dimensions` method.
+The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::split` method.

Let's consider an example of the transformation:

10 changes: 5 additions & 5 deletions src/common/snippets/include/snippets/op/load.hpp
@@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op {
};

/**
- * @interface LoadReshape
+ * @interface LoadReorder
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
* @ingroup snippets
*/
-class LoadReshape : public Load {
+class LoadReorder : public Load {
public:
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReshape() = default;
OPENVINO_OP("LoadReorder", "SnippetsOpset", Load);
LoadReorder(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReorder() = default;

void set_offset(size_t offset) { set_output_offset(offset, 0); }
void set_count(size_t count) { set_output_count(count, 0); }
src/common/snippets/include/snippets/op/rank_normalization.hpp
@@ -4,7 +4,7 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
@@ -21,9 +21,9 @@ namespace op {
// Note that technically the same goal could be achieved using op::Unsqueeze operation,
// but RankNormalization has a much narrower semantics, and hence allows for an easier control and a more efficient shape infer.
//
-class RankNormalization : public ov::op::Op {
+class RankNormalization : public ShapeInferOp {
public:
OPENVINO_OP("RankNormalization", "SnippetsOpset");
OPENVINO_OP("RankNormalization", "SnippetsOpset", ShapeInferOp);

RankNormalization() = default;
RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append);
43 changes: 43 additions & 0 deletions src/common/snippets/include/snippets/op/reorder.hpp
@@ -0,0 +1,43 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
namespace op {
/**
* @interface Reorder
 * @brief Reorder reshapes input tensor shape by required target order.
* The tensor data is not updated.
* Note: Order is stored in input PortDescriptor
* @ingroup snippets
*/
class Reorder : public ShapeInferOp {
public:
OPENVINO_OP("Reorder", "SnippetsOpset", ShapeInferOp);
Reorder() = default;
Reorder(const Output<Node>& x, std::vector<size_t> order);

bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;

class ShapeInfer : public IShapeInferSnippets {
std::vector<size_t> m_target_order {};
public:
explicit ShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

private:
void custom_constructor_validate_and_infer_types(std::vector<size_t> order);
};

} // namespace op
} // namespace snippets
} // namespace ov
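
For intuition, here is a stand-alone sketch of this order-based shape inference (a hypothetical helper, not the actual `Reorder::ShapeInfer::infer` / `utils::get_planar_vdims` code; `LoadReorder` propagates shapes the same way):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

using VectorDims = std::vector<size_t>;

// Reorder-style shape inference: permute the dims by the target order.
// Only the shape descriptor changes; no tensor data is moved.
VectorDims reorder_shape(const VectorDims& shape, const std::vector<size_t>& order) {
    assert(shape.size() == order.size());
    VectorDims out(shape.size());
    for (size_t i = 0; i < order.size(); ++i)
        out[i] = shape[order[i]];
    return out;
}

int main() {
    // A [2, 3, 4] shape with target order {0, 2, 1} becomes [2, 4, 3].
    assert((reorder_shape({2, 3, 4}, {0, 2, 1}) == VectorDims{2, 4, 3}));
}
```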
15 changes: 12 additions & 3 deletions src/common/snippets/include/snippets/op/reshape.hpp
@@ -4,7 +4,8 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
@@ -15,9 +16,9 @@ namespace op {
 * @brief Reshape input tensor to required target shape
* @ingroup snippets
*/
-class Reshape : public ov::op::Op {
+class Reshape : public ShapeInferOp {
 public:
-OPENVINO_OP("Reshape", "SnippetsOpset");
+OPENVINO_OP("Reshape", "SnippetsOpset", ShapeInferOp);
Reshape(const Output<Node>& x, ov::PartialShape target_shape);
Reshape() = default;

@@ -28,6 +29,14 @@ class Reshape : public ov::op::Op {
const ov::PartialShape& get_target_shape() const;
void set_target_shape(ov::PartialShape shape);

+class ShapeInfer : public IShapeInferSnippets {
+    VectorDims target_shape;
+    size_t target_shape_volume = 0;
+public:
+    explicit ShapeInfer(const std::shared_ptr<Node>& n);
+    Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
+};

private:
ov::PartialShape m_target_shape = {};
};
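
The cached `target_shape_volume` lets `infer` cheaply validate that the input element count matches the fixed target shape. A rough sketch of that idea (hypothetical code, not the actual implementation):

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using VectorDims = std::vector<size_t>;

size_t volume(const VectorDims& dims) {
    return std::accumulate(dims.begin(), dims.end(), size_t{1}, std::multiplies<size_t>{});
}

// The output shape is fixed at construction time, so inference reduces to
// an element-count check plus returning the precomputed target shape.
struct ReshapeInferSketch {
    VectorDims target_shape;
    size_t target_shape_volume;

    explicit ReshapeInferSketch(VectorDims shape)
        : target_shape(std::move(shape)), target_shape_volume(volume(target_shape)) {}

    VectorDims infer(const VectorDims& input_shape) const {
        assert(volume(input_shape) == target_shape_volume && "Reshape must preserve the element count");
        return target_shape;
    }
};

int main() {
    ReshapeInferSketch reshape({2, 12});
    assert((reshape.infer({2, 3, 4}) == VectorDims{2, 12}));  // 24 elements either way
}
```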
27 changes: 27 additions & 0 deletions src/common/snippets/include/snippets/op/shape_infer_op.hpp
@@ -0,0 +1,27 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
* @interface ShapeInferOp
* @brief Op which infers shape without actually moving data
* @ingroup snippets
*/
class ShapeInferOp : public ov::op::Op {
public:
OPENVINO_OP("ShapeInferOp", "SnippetsOpset");
ShapeInferOp() = default;
ShapeInferOp(const OutputVector& args) : ov::op::Op(args) {}
};

} // namespace op
} // namespace snippets
} // namespace ov
15 changes: 14 additions & 1 deletion src/common/snippets/include/snippets/pass/split_dimension_m.hpp
@@ -67,11 +67,24 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass {

private:
static std::shared_ptr<ov::op::v0::MatMul> get_matmul(const std::shared_ptr<op::Subgraph>& subgraph);
-static std::pair<size_t, size_t> get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Contains splitM approaches allowing to get the batch ideally divisible by optimal_parallelism_work_amount
+ */
+static std::pair<size_t, size_t> split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Splits m_dim to minimize kernel_m in order to reduce waiting time for idle threads at the last parallel loop iteration.
+ */
+static std::pair<size_t, size_t> split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
+/**
+ * @brief Splits m_dim to get the batch in (optimal_parallelism_work_amount, 2 * optimal_parallelism_work_amount) interval
+ */
+static std::pair<size_t, size_t> split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);

void reshape_subgraph(const std::shared_ptr<op::Subgraph>& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim);

size_t m_concurrency;

+static const size_t min_kernel_m;
};
} // namespace pass
} // namespace snippets
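
As a worked example: with `batch_dim = 2`, `m_dim = 384`, and 8 threads, splitting M as `384 = 8 * 48` gives `2 * 8 = 16` parallel units, evenly divisible by the thread count. A hypothetical sketch of an "ideal split" search in the spirit of `split_ideally` (not the actual heuristic):

```cpp
#include <cassert>
#include <cstddef>
#include <utility>

// Find batch_m such that batch_m divides m_dim and the resulting parallel
// work amount (batch_dim * batch_m) is divisible by the thread count.
// Returns {batch_m, new_m}, or {1, m_dim} when no such split exists.
std::pair<size_t, size_t> split_ideally_sketch(size_t batch_dim, size_t m_dim, size_t n_threads) {
    for (size_t batch_m = n_threads; batch_m >= 2; --batch_m)
        if (m_dim % batch_m == 0 && (batch_dim * batch_m) % n_threads == 0)
            return {batch_m, m_dim / batch_m};
    return {1, m_dim};
}

int main() {
    const auto split = split_ideally_sketch(2, 384, 8);
    assert(split.first == 8 && split.second == 48);
}
```

The other two declared strategies cover the cases such a search handles poorly: `split_minimize_kernel_wa` trades divisibility for a smaller kernel-side M (bounded below by `min_kernel_m`) so idle threads wait less on the last loop iteration, and `split_fallback_increase_parallel_wa` merely pushes the batch into the `(work_amount, 2 * work_amount)` interval as a last resort.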
src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp
@@ -75,12 +75,5 @@ class ReduceShapeInfer : public IShapeInferSnippets {
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

-class ReshapeShapeInfer : public IShapeInferSnippets {
-    VectorDims target_shape;
-    size_t target_shape_volume = 0;
-public:
-    explicit ReshapeShapeInfer(const std::shared_ptr<Node>& n);
-    Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
-};
} // namespace snippets
} // namespace ov
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
@@ -18,6 +18,7 @@
#include "op/kernel.hpp"
#include "op/load.hpp"
#include "op/reshape.hpp"
#include "op/reorder.hpp"
#include "op/nop.hpp"
#include "op/scalar.hpp"
#include "op/powerstatic.hpp"
3 changes: 2 additions & 1 deletion src/common/snippets/include/snippets/snippets_isa_tbl.hpp
@@ -11,12 +11,13 @@

// SnippetS dialect
OV_OP(Load, ov::snippets::op)
-OV_OP(LoadReshape, ov::snippets::op)
+OV_OP(LoadReorder, ov::snippets::op)
OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
OV_OP(BroadcastLoad, ov::snippets::op)
OV_OP(Reshape, ov::snippets::op)
+OV_OP(Reorder, ov::snippets::op)

OV_OP(Store, ov::snippets::op)

15 changes: 14 additions & 1 deletion src/common/snippets/include/snippets/utils/utils.hpp
@@ -290,13 +290,26 @@ std::shared_ptr<ov::Node> get_leaf_node_of_first_child_shape_infer_seq(const std
std::shared_ptr<ov::Node> get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr<ov::Node>& start_node);

/**
 * @brief Get stride of input/output dimension
 * @param expr_port target port that contains shape and layout info
 * @param idx index of the target dimension starting from the shape's end (default = 1)
 */
int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1);
/**
* @brief Get stride of input dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);
/**
* @brief Get stride of output dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);

/**
* @brief Traverses path starting from "expr", and calls "func" for each expression.
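
A worked illustration of the stride semantics (a hypothetical reimplementation; it assumes `layout` lists dimension indices from outermost to innermost in memory and ignores the input/output port distinction the real helpers handle): the stride of a dimension is the product of the extents of all dimensions stored after it.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

using VectorDims = std::vector<size_t>;

// Stride of the dimension at position `pos` of `layout` (illustrative only).
int64_t stride_in_memory_order(const VectorDims& shape, const VectorDims& layout, size_t pos) {
    assert(shape.size() == layout.size() && pos < layout.size());
    int64_t stride = 1;
    for (size_t i = pos + 1; i < layout.size(); ++i)
        stride *= static_cast<int64_t>(shape[layout[i]]);
    return stride;
}

int main() {
    const VectorDims shape{2, 3, 4};
    // Planar layout {0, 1, 2}: strides are 12, 4, 1.
    assert(stride_in_memory_order(shape, {0, 1, 2}, 0) == 12);
    // Transposed layout {0, 2, 1}: dimension 2 (extent 4) now precedes
    // dimension 1 in memory, so its stride is shape[1] = 3.
    assert(stride_in_memory_order(shape, {0, 2, 1}, 1) == 3);
}
```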
1 change: 1 addition & 0 deletions src/common/snippets/src/generator.cpp
@@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output<Node>& out) const {
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::Reshape>(op) ||
+std::dynamic_pointer_cast<op::Reorder>(op) ||
std::dynamic_pointer_cast<snippets::op::Store>(op)
#ifdef SNIPPETS_DEBUG_CAPS
|| std::dynamic_pointer_cast<op::PerfCountBeginBase>(op)
src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp
@@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) {
const auto& node = expr->get_node();
return ov::is_type<ov::snippets::op::Brgemm>(node) ||
ov::is_type<ov::snippets::op::Reshape>(node) ||
-ov::is_type<ov::snippets::op::LoadReshape>(node);
+ov::is_type<ov::snippets::op::LoadReorder>(node);
}
} // namespace

28 changes: 14 additions & 14 deletions src/common/snippets/src/op/load.cpp
@@ -41,19 +41,19 @@ std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args)
return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
}

-LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
+LoadReorder::LoadReorder(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
: Load(x, count, offset), m_order(std::move(order)) {
const auto& in_shape = x.get_partial_shape();
const auto in_shape_size = in_shape.size();
-OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
+OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size");
OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
-*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
+*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order");
const std::set<size_t> unique_dims(order.begin(), order.end());
-OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
+OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements");
constructor_validate_and_infer_types();
}

-void LoadReshape::validate_and_infer_types() {
+void LoadReorder::validate_and_infer_types() {
validate_memory_access_params();
const auto& old_shape = get_input_partial_shape(0);
ov::PartialShape new_shape;
@@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), new_shape);
}

-bool LoadReshape::visit_attributes(AttributeVisitor& visitor) {
+bool LoadReorder::visit_attributes(AttributeVisitor& visitor) {
MemoryAccess::visit_attributes(visitor);
visitor.on_attribute("order", m_order);
return true;
}

-std::shared_ptr<Node> LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
-INTERNAL_OP_SCOPE(LoadReshape);
+std::shared_ptr<Node> LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const {
+INTERNAL_OP_SCOPE(LoadReorder);
check_new_args_count(this, new_args);
-return std::make_shared<LoadReshape>(new_args.at(0), get_count(), get_offset(), m_order);
+return std::make_shared<LoadReorder>(new_args.at(0), get_count(), get_offset(), m_order);
}
-LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
-const auto& loadReshape = ov::as_type_ptr<LoadReshape>(n);
-OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer");
-m_order = loadReshape->m_order;
+LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
+const auto& loadReorder = ov::as_type_ptr<LoadReorder>(n);
+OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer");
+m_order = loadReorder->m_order;
}
-IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
+IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes");
return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success};
}
3 changes: 1 addition & 2 deletions src/common/snippets/src/op/rank_normalization.cpp
@@ -10,11 +10,10 @@ namespace snippets {
namespace op {

RankNormalization::RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append) :
-Op({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
+ShapeInferOp({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
constructor_validate_and_infer_types();
}


std::shared_ptr<ov::Node> RankNormalization::clone_with_new_inputs(const OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<RankNormalization>(new_args[0], m_num_prepend, m_num_append);