Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Snippets][CPU] Added external repacking via BrgemmCopyB #28179

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/common/snippets/docs/mha_optimization_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ The supported by decomposition Transpose orders are defined by `TokenizeMHASnipp

[SplitDimensionM](../src/pass/split_dimension_m.cpp) splits the M dimension of MHA into 2 parts (`batch_m` and `new_m`) by inserting a Reshape on the A input of the first Matmul and on the output of the second Matmul (the rest of the Subgraph's inputs are reshaped by Unsqueeze-like reshapes in order not to break the subgraph semantics).
This optimization increases parallel work amount by `batch_m` times thus enabling a more efficient parallel execution in some cases.
The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::get_splited_dimensions` method.
The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::split` method.

Let's consider an example of the transformation:

Expand Down
10 changes: 5 additions & 5 deletions src/common/snippets/include/snippets/op/load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op {
};

/**
* @interface LoadReshape
* @interface LoadReorder
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
* @ingroup snippets
*/
class LoadReshape : public Load {
class LoadReorder : public Load {
public:
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReshape() = default;
OPENVINO_OP("LoadReorder", "SnippetsOpset", Load);
LoadReorder(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
LoadReorder() = default;

void set_offset(size_t offset) { set_output_offset(offset, 0); }
void set_count(size_t count) { set_output_count(count, 0); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
Expand All @@ -21,9 +21,9 @@ namespace op {
// Note that technically the same goal could be achieved using op::Unsqueeze operation,
// but RankNormalization has a much narrower semantics, and hence allows for an easier control and a more efficient shape infer.
//
class RankNormalization : public ov::op::Op {
class RankNormalization : public ShapeInferOp {
public:
OPENVINO_OP("RankNormalization", "SnippetsOpset");
OPENVINO_OP("RankNormalization", "SnippetsOpset", ShapeInferOp);

RankNormalization() = default;
RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append);
Expand Down
43 changes: 43 additions & 0 deletions src/common/snippets/include/snippets/op/reorder.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
namespace op {
/**
* @interface Reorder
* @brief Reorder changes the input tensor shape according to the required target order.
* The tensor data is not updated.
* Note: Order is stored in input PortDescriptor
* @ingroup snippets
*/
class Reorder : public ShapeInferOp {
public:
// Declares the op type as "Reorder" in "SnippetsOpset" with ShapeInferOp as the parent type.
OPENVINO_OP("Reorder", "SnippetsOpset", ShapeInferOp);
Reorder() = default;
/**
* @brief Creates Reorder for the given input
* @param x input whose shape should be reordered
* @param order target order (per the note above, it is kept in the input PortDescriptor rather than in a class member)
*/
Reorder(const Output<Node>& x, std::vector<size_t> order);

// Standard ov::Node overrides; their definitions are not part of this header.
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
void validate_and_infer_types() override;

/**
* @brief Snippets shape inference implementation for Reorder
* NOTE(review): m_target_order is presumably read from the node passed to the constructor -
* the definition is not visible here, confirm against the .cpp file.
*/
class ShapeInfer : public IShapeInferSnippets {
// Target order used by infer(); empty by default.
std::vector<size_t> m_target_order {};
public:
explicit ShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

private:
// NOTE(review): presumably called from the constructor that takes `order` to validate it
// and infer output types - the definition is not visible here, confirm against the .cpp file.
void custom_constructor_validate_and_infer_types(std::vector<size_t> order);
};

} // namespace op
} // namespace snippets
} // namespace ov
15 changes: 12 additions & 3 deletions src/common/snippets/include/snippets/op/reshape.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

#pragma once

#include "openvino/op/op.hpp"
#include "shape_infer_op.hpp"
#include "snippets/shape_inference/shape_inference.hpp"

namespace ov {
namespace snippets {
Expand All @@ -15,9 +16,9 @@ namespace op {
* @brief Reshape input tensor to required target shape
* @ingroup snippets
*/
class Reshape : public ov::op::Op {
class Reshape : public ShapeInferOp {
public:
OPENVINO_OP("Reshape", "SnippetsOpset");
OPENVINO_OP("Reshape", "SnippetsOpset", ShapeInferOp);
Reshape(const Output<Node>& x, ov::PartialShape target_shape);
Reshape() = default;

Expand All @@ -28,6 +29,14 @@ class Reshape : public ov::op::Op {
const ov::PartialShape& get_target_shape() const;
void set_target_shape(ov::PartialShape shape);

class ShapeInfer : public IShapeInferSnippets {
VectorDims target_shape;
size_t target_shape_volume = 0;
public:
explicit ShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

private:
ov::PartialShape m_target_shape = {};
};
Expand Down
27 changes: 27 additions & 0 deletions src/common/snippets/include/snippets/op/shape_infer_op.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace snippets {
namespace op {

/**
* @interface ShapeInferOp
* @brief Op which infers shape without actually moving data.
* It is intended to be used as a common base class for such operations.
* @ingroup snippets
*/
class ShapeInferOp : public ov::op::Op {
public:
// Declares the op type as "ShapeInferOp" in "SnippetsOpset".
OPENVINO_OP("ShapeInferOp", "SnippetsOpset");
ShapeInferOp() = default;
/**
* @brief Creates the op with the given inputs
* @param args input outputs that are forwarded to the ov::op::Op constructor
*/
ShapeInferOp(const OutputVector& args) : ov::op::Op(args) {}
};

} // namespace op
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,24 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass {

private:
static std::shared_ptr<ov::op::v0::MatMul> get_matmul(const std::shared_ptr<op::Subgraph>& subgraph);
static std::pair<size_t, size_t> get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
/**
* @brief Contains splitM approaches allowing to get the batch ideally divisible by optimal_parallelism_work_amount
*/
static std::pair<size_t, size_t> split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
/**
* @brief Splits m_dim to minimize kernel_m in order to reduce waiting time for idle threads at the last parallel loop iteration.
*/
static std::pair<size_t, size_t> split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);
/**
* @brief Splits m_dim to get the batch in (optimal_parallelism_work_amount, 2 * optimal_parallelism_work_amount) interval
*/
static std::pair<size_t, size_t> split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount);

void reshape_subgraph(const std::shared_ptr<op::Subgraph>& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim);

size_t m_concurrency;

static const size_t min_kernel_m;
};
} // namespace pass
} // namespace snippets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,5 @@ class ReduceShapeInfer : public IShapeInferSnippets {
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};

class ReshapeShapeInfer : public IShapeInferSnippets {
VectorDims target_shape;
size_t target_shape_volume = 0;
public:
explicit ReshapeShapeInfer(const std::shared_ptr<Node>& n);
Result infer(const std::vector<VectorDimsRef>& input_shapes) override;
};
} // namespace snippets
} // namespace ov
1 change: 1 addition & 0 deletions src/common/snippets/include/snippets/snippets_isa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "op/kernel.hpp"
#include "op/load.hpp"
#include "op/reshape.hpp"
#include "op/reorder.hpp"
#include "op/nop.hpp"
#include "op/scalar.hpp"
#include "op/powerstatic.hpp"
Expand Down
3 changes: 2 additions & 1 deletion src/common/snippets/include/snippets/snippets_isa_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@

// SnippetS dialect
OV_OP(Load, ov::snippets::op)
OV_OP(LoadReshape, ov::snippets::op)
OV_OP(LoadReorder, ov::snippets::op)
OV_OP(LoopBegin, ov::snippets::op)
OV_OP(LoopEnd, ov::snippets::op)
OV_OP(Brgemm, ov::snippets::op)
OV_OP(BroadcastLoad, ov::snippets::op)
OV_OP(Reshape, ov::snippets::op)
OV_OP(Reorder, ov::snippets::op)

OV_OP(Store, ov::snippets::op)

Expand Down
15 changes: 14 additions & 1 deletion src/common/snippets/include/snippets/utils/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,26 @@ std::shared_ptr<ov::Node> get_leaf_node_of_first_child_shape_infer_seq(const std
std::shared_ptr<ov::Node> get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr<ov::Node>& start_node);

/**
*
* @brief Get stride of input/output dimension
* @param expr_port target port that contains shape and layout info
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/

int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1);
/**
* @brief Get stride of input dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);
/**
* @brief Get stride of output dimension
* @param shape target shape
* @param layout target layout
* @param idx index of the target dimension starting from the shape's end (default = 1)
*/
int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1);

/**
* @brief Traverses path starting from "expr", and calls "func" for each expression.
Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output<Node>& out) const {
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::RankNormalization>(op) ||
std::dynamic_pointer_cast<op::Reshape>(op) ||
std::dynamic_pointer_cast<op::Reorder>(op) ||
std::dynamic_pointer_cast<snippets::op::Store>(op)
#ifdef SNIPPETS_DEBUG_CAPS
|| std::dynamic_pointer_cast<op::PerfCountBeginBase>(op)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) {
const auto& node = expr->get_node();
return ov::is_type<ov::snippets::op::Brgemm>(node) ||
ov::is_type<ov::snippets::op::Reshape>(node) ||
ov::is_type<ov::snippets::op::LoadReshape>(node);
ov::is_type<ov::snippets::op::LoadReorder>(node);
}
} // namespace

Expand Down
28 changes: 14 additions & 14 deletions src/common/snippets/src/op/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,19 @@ std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args)
return std::make_shared<Load>(new_args.at(0), get_count(), get_offset());
}

LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
LoadReorder::LoadReorder(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
: Load(x, count, offset), m_order(std::move(order)) {
const auto& in_shape = x.get_partial_shape();
const auto in_shape_size = in_shape.size();
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size");
OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order");
const std::set<size_t> unique_dims(order.begin(), order.end());
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements");
constructor_validate_and_infer_types();
}

void LoadReshape::validate_and_infer_types() {
void LoadReorder::validate_and_infer_types() {
validate_memory_access_params();
const auto& old_shape = get_input_partial_shape(0);
ov::PartialShape new_shape;
Expand All @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() {
set_output_type(0, get_input_element_type(0), new_shape);
}

bool LoadReshape::visit_attributes(AttributeVisitor& visitor) {
bool LoadReorder::visit_attributes(AttributeVisitor& visitor) {
MemoryAccess::visit_attributes(visitor);
visitor.on_attribute("order", m_order);
return true;
}

std::shared_ptr<Node> LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReshape);
std::shared_ptr<Node> LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const {
INTERNAL_OP_SCOPE(LoadReorder);
check_new_args_count(this, new_args);
return std::make_shared<LoadReshape>(new_args.at(0), get_count(), get_offset(), m_order);
return std::make_shared<LoadReorder>(new_args.at(0), get_count(), get_offset(), m_order);
}
LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReshape = ov::as_type_ptr<LoadReshape>(n);
OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer");
m_order = loadReshape->m_order;
LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
const auto& loadReorder = ov::as_type_ptr<LoadReorder>(n);
OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer");
m_order = loadReorder->m_order;
}
IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector<VectorDimsRef>& input_shapes) {
OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes");
return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success};
}
Expand Down
3 changes: 1 addition & 2 deletions src/common/snippets/src/op/rank_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@ namespace snippets {
namespace op {

RankNormalization::RankNormalization(const Output<Node>& data, size_t num_prepend, size_t num_append) :
Op({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
ShapeInferOp({data}), m_num_prepend(num_prepend), m_num_append(num_append) {
constructor_validate_and_infer_types();
}


std::shared_ptr<ov::Node> RankNormalization::clone_with_new_inputs(const OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<RankNormalization>(new_args[0], m_num_prepend, m_num_append);
Expand Down
Loading
Loading