diff --git a/src/common/snippets/docs/mha_optimization_guide.md b/src/common/snippets/docs/mha_optimization_guide.md index 28245017833a4a..1ea3a4c24c3524 100644 --- a/src/common/snippets/docs/mha_optimization_guide.md +++ b/src/common/snippets/docs/mha_optimization_guide.md @@ -65,7 +65,7 @@ The supported by decomposition Transpose orders are defined by `TokenizeMHASnipp [SplitDimensionM](../src/pass/split_dimension_m.cpp) splits M dimension of MHA in 2 parts (`batch_m` and `new_m`) by inserting Reshape on A input of the first Matmul and output of the second Matmul (the rest Subgraph's inputs are reshaped by Unsqueeze-like reshapes in order not to break subgraph semantic). This optimization increases parallel work amount by `batch_m` times thus enabling a more efficient parallel execution in some cases. -The splitting is performed based on heuristic algorithm which can be found in `SplitDimensionM::get_splited_dimensions` method. +The splitting is performed based on a heuristic algorithm which can be found in the `SplitDimensionM::split` method. Let's consider an example of the transformation: diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index bca4b09fabdcbd..d0a168483bc5ce 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -41,17 +41,17 @@ class Load : public modifier::MemoryAccess, public ov::op::Op { }; /** - * @interface LoadReshape + * @interface LoadReorder * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak * shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. * @ingroup snippets */ -class LoadReshape : public Load { +class LoadReorder : public Load { public: - OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); - LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); - LoadReshape() = default; + OPENVINO_OP("LoadReorder", "SnippetsOpset", Load); + LoadReorder(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReorder() = default; void set_offset(size_t offset) { set_output_offset(offset, 0); } void set_count(size_t count) { set_output_count(count, 0); } diff --git a/src/common/snippets/include/snippets/op/rank_normalization.hpp b/src/common/snippets/include/snippets/op/rank_normalization.hpp index 47b18601f8d805..645f9edf527141 100644 --- a/src/common/snippets/include/snippets/op/rank_normalization.hpp +++ b/src/common/snippets/include/snippets/op/rank_normalization.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/op/op.hpp" +#include "shape_infer_op.hpp" #include "snippets/shape_inference/shape_inference.hpp" namespace ov { @@ -21,9 +21,9 @@ namespace op { // Note that technically the same goal could be achieved using op::Unsqueeze operation, // but RankNormalization has a much narrower semantics, and hence allows for an easier control and a more efficient shape infer.
// -class RankNormalization : public ov::op::Op { +class RankNormalization : public ShapeInferOp { public: - OPENVINO_OP("RankNormalization", "SnippetsOpset"); + OPENVINO_OP("RankNormalization", "SnippetsOpset", ShapeInferOp); RankNormalization() = default; RankNormalization(const Output& data, size_t num_prepend, size_t num_append); diff --git a/src/common/snippets/include/snippets/op/reorder.hpp b/src/common/snippets/include/snippets/op/reorder.hpp new file mode 100644 index 00000000000000..79b024b768aa76 --- /dev/null +++ b/src/common/snippets/include/snippets/op/reorder.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shape_infer_op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" + +namespace ov { +namespace snippets { +namespace op { +/** + * @interface Reorder + * @brief Reorder reshapes input tensor shape by the required target order. + * The tensor data is not updated. + * Note: Order is stored in input PortDescriptor + * @ingroup snippets + */ +class Reorder : public ShapeInferOp { +public: + OPENVINO_OP("Reorder", "SnippetsOpset", ShapeInferOp); + Reorder() = default; + Reorder(const Output& x, std::vector order); + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + + class ShapeInfer : public IShapeInferSnippets { + std::vector m_target_order {}; + public: + explicit ShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; + }; + +private: + void custom_constructor_validate_and_infer_types(std::vector order); +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp index b4e0c9233c73f0..c86150a1479364 100644 --- a/src/common/snippets/include/snippets/op/reshape.hpp +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -4,7 +4,8 @@ #pragma once -#include "openvino/op/op.hpp" +#include "shape_infer_op.hpp" +#include "snippets/shape_inference/shape_inference.hpp" namespace ov { namespace snippets { @@ -15,9 +16,9 @@ namespace op { * @brief Reshape input tensor to required target shape * @ingroup snippets */ -class Reshape : public ov::op::Op { +class Reshape : public ShapeInferOp { public: - OPENVINO_OP("Reshape", "SnippetsOpset"); + OPENVINO_OP("Reshape", "SnippetsOpset", ShapeInferOp); Reshape(const Output& x, ov::PartialShape target_shape); Reshape() = default; @@ -28,6 +29,14 @@ class Reshape : public ov::op::Op { const ov::PartialShape& get_target_shape() const; void set_target_shape(ov::PartialShape shape); + class ShapeInfer : public IShapeInferSnippets { + VectorDims target_shape; + size_t target_shape_volume = 0; + public: + explicit ShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; + }; + private: ov::PartialShape m_target_shape = {}; }; diff --git a/src/common/snippets/include/snippets/op/shape_infer_op.hpp b/src/common/snippets/include/snippets/op/shape_infer_op.hpp new file mode 100644 index 00000000000000..a1462cbb426fd9 --- /dev/null +++ b/src/common/snippets/include/snippets/op/shape_infer_op.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov { +namespace snippets { +namespace op { + +/** + * 
@interface ShapeInferOp + * @brief Op which infers shape without actually moving data + * @ingroup snippets + */ +class ShapeInferOp : public ov::op::Op { +public: + OPENVINO_OP("ShapeInferOp", "SnippetsOpset"); + ShapeInferOp() = default; + ShapeInferOp(const OutputVector& args) : ov::op::Op(args) {} +}; + +} // namespace op +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp index e9a9a46d3847ff..b93f09bf62803e 100644 --- a/src/common/snippets/include/snippets/pass/split_dimension_m.hpp +++ b/src/common/snippets/include/snippets/pass/split_dimension_m.hpp @@ -67,11 +67,24 @@ class SplitDimensionM: public CommonOptimizations::SubgraphPass { private: static std::shared_ptr get_matmul(const std::shared_ptr& subgraph); - static std::pair get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Contains splitM approaches allowing to get the batch ideally divisible by optimal_parallelism_work_amount + */ + static std::pair split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Splits m_dim to minimize kernel_m in order to reduce waiting time for idle threads at the last parallel loop iteration. + */ + static std::pair split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); + /** + * @brief Splits m_dim to get the batch in (optimal_parallelism_work_amount, 2 * optimal_parallelism_work_amount) interval + */ + static std::pair split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount); void reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim); size_t m_concurrency; + + static const size_t min_kernel_m; }; } // namespace pass } // namespace snippets diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index 1b91ea573ab1c4..f6cd6f0626f798 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -75,12 +75,5 @@ class ReduceShapeInfer : public IShapeInferSnippets { Result infer(const std::vector& input_shapes) override; }; -class ReshapeShapeInfer : public IShapeInferSnippets { - VectorDims target_shape; - size_t target_shape_volume = 0; -public: - explicit ReshapeShapeInfer(const std::shared_ptr& n); - Result infer(const std::vector& input_shapes) override; -}; } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index 08002fa38ed309..72198decc366d9 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -18,6 +18,7 @@ #include "op/kernel.hpp" #include "op/load.hpp" #include "op/reshape.hpp" +#include "op/reorder.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 9b207b09fe411f..9dc416b3f7e38f 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ 
-11,12 +11,13 @@ // SnippetS dialect OV_OP(Load, ov::snippets::op) -OV_OP(LoadReshape, ov::snippets::op) +OV_OP(LoadReorder, ov::snippets::op) OV_OP(LoopBegin, ov::snippets::op) OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) OV_OP(Reshape, ov::snippets::op) +OV_OP(Reorder, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index ff4646f24d03b7..0569a230e91f32 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -290,13 +290,26 @@ std::shared_ptr get_leaf_node_of_first_child_shape_infer_seq(const std std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr& start_node); /** - * * @param Get stride of input/output dimension * @param expr_port target port that contains shape and layout info * @param idx index of the target dimension starting from the shape's end (default = 1) */ int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1); +/** + * @brief Get stride of input dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); +/** + * @brief Get stride of output dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); /** * @brief Traverses path starting from "expr", and calls "func" for each expression. 
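For context on the `get_dim_in_stride`/`get_dim_out_stride` helpers declared above: they differ only in how the dimension index is looked up in the port layout before a common stride computation. Below is a minimal standalone sketch of the assumed semantics; the helper bodies and the layout convention (an input layout lists planar dim indices in memory order, an output layout is treated as the inverse mapping) are local stand-ins for the `snippets::utils` implementations, not copies of them.

```cpp
// Standalone sketch (not OpenVINO code): dimension strides from a planar shape + layout.
// Assumption: the stride of dim d is the product of the dims to the right of d.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using VectorDims = std::vector<size_t>;

int64_t get_stride(size_t dim_idx, const VectorDims& shape) {
    int64_t stride = 1;
    for (size_t i = dim_idx + 1; i < shape.size(); ++i)
        stride *= static_cast<int64_t>(shape[i]);
    return stride;
}

// Input port: the idx-th memory dim from the end is planar dim layout[rank - 1 - idx] (assumed convention)
int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1) {
    return get_stride(layout[layout.size() - 1 - idx], shape);
}

// Output port: find the memory position where planar dim (rank - 1 - idx) was placed (assumed convention)
int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1) {
    const size_t planar_dim = layout.size() - 1 - idx;
    for (size_t i = 0; i < layout.size(); ++i)
        if (layout[i] == planar_dim)
            return get_stride(i, shape);
    return 0;  // unreachable for a valid permutation
}

int main() {
    const VectorDims shape{2, 8, 16, 64}, layout{0, 2, 1, 3};  // 0213 "transposed" layout
    std::cout << get_dim_in_stride(shape, layout) << "\n";     // layout[2] = 1 -> 16 * 64 = 1024
    std::cout << get_dim_out_stride(shape, layout) << "\n";    // planar dim 2 sits at pos 1 -> 1024
}
```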
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index d059ddd94d5724..fad0086427c93d 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp index b32056d4e32a57..16d4160f1aaeb2 100644 --- a/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp +++ b/src/common/snippets/src/lowered/pass/mark_invariant_shape_path.cpp @@ -41,7 +41,7 @@ static bool is_affecting_op(const ExpressionPtr& expr) { const auto& node = expr->get_node(); return ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node); } } // namespace diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 461fec8b1399c0..9bd1e4c7bc8706 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -41,19 +41,19 @@ std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) return std::make_shared(new_args.at(0), get_count(), get_offset()); } -LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) +LoadReorder::LoadReorder(const Output& x, const size_t count, const size_t offset, std::vector order) : Load(x, count, offset), m_order(std::move(order)) { const auto& in_shape = x.get_partial_shape(); const auto in_shape_size = in_shape.size(); - OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + OPENVINO_ASSERT(m_order.size() == in_shape_size, "LoadReorder got new_order of invalid size"); OPENVINO_ASSERT(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && - *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReorder detected invalid values in new_order"); const std::set unique_dims(order.begin(), order.end()); - OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + OPENVINO_ASSERT(unique_dims.size() == order.size(), "LoadReorder order must not contain repeated elements"); constructor_validate_and_infer_types(); } -void LoadReshape::validate_and_infer_types() { +void LoadReorder::validate_and_infer_types() { validate_memory_access_params(); const auto& old_shape = get_input_partial_shape(0); ov::PartialShape new_shape; @@ -62,23 +62,23 @@ void LoadReshape::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), new_shape); } -bool LoadReshape::visit_attributes(AttributeVisitor& visitor) { +bool LoadReorder::visit_attributes(AttributeVisitor& visitor) { MemoryAccess::visit_attributes(visitor); visitor.on_attribute("order", m_order); return true; } -std::shared_ptr LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(LoadReshape); +std::shared_ptr LoadReorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReorder); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), get_count(), 
get_offset(), m_order); + return std::make_shared(new_args.at(0), get_count(), get_offset(), m_order); } -LoadReshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& loadReshape = ov::as_type_ptr(n); - OPENVINO_ASSERT(loadReshape, "Got invalid node in LoadReshape::ShapeInfer"); - m_order = loadReshape->m_order; +LoadReorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& loadReorder = ov::as_type_ptr(n); + OPENVINO_ASSERT(loadReorder, "Got invalid node in LoadReorder::ShapeInfer"); + m_order = loadReorder->m_order; } -IShapeInferSnippets::Result LoadReshape::ShapeInfer::infer(const std::vector& input_shapes) { +IShapeInferSnippets::Result LoadReorder::ShapeInfer::infer(const std::vector& input_shapes) { OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes"); return {{utils::get_planar_vdims(input_shapes[0], m_order)}, ShapeInferStatus::success}; } diff --git a/src/common/snippets/src/op/rank_normalization.cpp b/src/common/snippets/src/op/rank_normalization.cpp index 4986f0d7fae6ef..2eab2dedc8aeb5 100644 --- a/src/common/snippets/src/op/rank_normalization.cpp +++ b/src/common/snippets/src/op/rank_normalization.cpp @@ -10,11 +10,10 @@ namespace snippets { namespace op { RankNormalization::RankNormalization(const Output& data, size_t num_prepend, size_t num_append) : - Op({data}), m_num_prepend(num_prepend), m_num_append(num_append) { + ShapeInferOp({data}), m_num_prepend(num_prepend), m_num_append(num_append) { constructor_validate_and_infer_types(); } - std::shared_ptr RankNormalization::clone_with_new_inputs(const OutputVector& new_args) const { check_new_args_count(this, new_args); return std::make_shared(new_args[0], m_num_prepend, m_num_append); diff --git a/src/common/snippets/src/op/reorder.cpp b/src/common/snippets/src/op/reorder.cpp new file mode 100644 index 00000000000000..43d8387a8cb2fb --- /dev/null +++ b/src/common/snippets/src/op/reorder.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/op/reorder.hpp" +#include "snippets/utils/utils.hpp" + + +namespace ov { +namespace snippets { +namespace op { + +Reorder::Reorder(const Output& arg, std::vector order) + : ShapeInferOp({arg}) { + custom_constructor_validate_and_infer_types(std::move(order)); +} + +void Reorder::custom_constructor_validate_and_infer_types(std::vector order) { + INTERNAL_OP_SCOPE(Reorder_constructor_validate_and_infer_types); + + const auto& input_pshape = get_input_partial_shape(0); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + + // During ctor call, Reorder doesn't know its port descriptors.
+ // So we use explicit layouts from parameters + set_output_type(0, get_input_element_type(0), ov::snippets::utils::get_planar_pshape(input_pshape, order)); +} + +void Reorder::validate_and_infer_types() { + const auto& input_pshape = get_input_partial_shape(0); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); + set_output_type(0, get_input_element_type(0), output_pshape); +} + +std::shared_ptr Reorder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Reorder); + check_new_args_count(this, new_args); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + return std::make_shared(new_args.at(0), order); +} + +bool Reorder::visit_attributes(AttributeVisitor& visitor) { + auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + visitor.on_attribute("target_order", order); + return true; +} + +Reorder::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& op = as_type_ptr(n); + OPENVINO_ASSERT(op, "Invalid node passed to ReorderShapeInfer."); + m_target_order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(op->input(0))->get_layout(); +} + +IShapeInferSnippets::Result Reorder::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReorderShapeInfer"); + return {{ov::snippets::utils::get_planar_vdims(input_shapes[0].get(), m_target_order)}, ShapeInferStatus::success}; +} + +}// namespace op +}// namespace snippets +}// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp index 72823d2815cdbf..9b46ea73247f39 100644 --- a/src/common/snippets/src/op/reshape.cpp +++ b/src/common/snippets/src/op/reshape.cpp @@ -11,8 +11,9 @@ namespace ov { namespace snippets { namespace op { + Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) - : Op({arg}), m_target_shape(std::move(target_shape)) { + : ShapeInferOp({arg}), m_target_shape(std::move(target_shape)) { constructor_validate_and_infer_types(); } @@ -38,6 +39,24 @@ const ov::PartialShape& Reshape::get_target_shape() const { void Reshape::set_target_shape(ov::PartialShape shape) { m_target_shape = std::move(shape); } + +Reshape::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); + const auto& partial_shape = reshape->get_target_shape(); + OPENVINO_ASSERT(partial_shape.is_static(), "target_shape of reshape op should be static in ReshapeShapeInfer"); + target_shape = partial_shape.get_shape(); + target_shape_volume = utils::get_shape_size(target_shape); +} + +IShapeInferSnippets::Result Reshape::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReshapeShapeInfer"); + const auto input_shape_volume = utils::get_shape_size(input_shapes[0].get()); + OPENVINO_ASSERT(input_shape_volume == target_shape_volume, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); + + return {{target_shape}, ShapeInferStatus::success}; +} + }// namespace op }// namespace snippets }// namespace ov 
\ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 98e3392a65e1e2..aff2341cc8bf9d 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -95,8 +95,7 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo } auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { - return ov::is_type(op) || - ov::is_type(op); + return ov::is_type(op); } void Subgraph::init_config() { diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index beb465ab3a3fbe..c6b5045cfeee62 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -344,45 +344,6 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken * Transpose3 */ - // First input branch of MatMul0 should be executed before second input branch of MatMul0, - // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0 - // Note: If MatMul0 has transposed_b, we should tokenize only scalars ops from 1st branch - // to move extracted Transpose from MatMul input to body Parameter - auto parent = matmul0->get_input_node_shared_ptr(1); - // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order (or without this Transpose1) - // only if these ops have scalar shapes on other inputs. - // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). - // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching - const auto is_transposed_b_0 = matmul0->get_transpose_b(); - bool has_matmul0_has_ops_on_input = false; - while (is_supported_intermediate_op(parent)) { - // All supported ops have only one output port - if (parent->get_output_target_inputs(0).size() != 1) - break; - - // Only if MatMul0 has transposed_b, we have to tokenize scalar ops - // to move explicit Transpose from MatMul0 input_1 to Parameter of Subgraph body - if (is_transposed_b_0 && !ov::snippets::pass::ExplicitTransposeMatMulInputs::are_weights_scalar(parent)) { - break; - } - - // To avoid unsupported number of non-scalar Constants in the future after FakeQuantize decomposition (plugin specific limitation) - // we should calculate potential number of non-scalar Constants for FakeQuantize that will be moved up from body. - if (const auto fq_node = ov::as_type_ptr(parent)) { - hidden_virtual_ports_count += ov::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); - } - - potential_body_params_count += get_potential_body_params(parent); - ordered_ops.insert(ordered_ops.begin(), parent); - // [107731] To go always through 0-th port - is it safe? 
- parent = parent->get_input_node_shared_ptr(0); - has_matmul0_has_ops_on_input = true; - } - // If there are ops on second input of MatMul0 and only one unique Buffer between MatMuls - there must be one more unique Buffer - if (has_matmul0_has_ops_on_input && uniqie_buffer_reg_group_count < 2) { - uniqie_buffer_reg_group_count++; - } - auto tokenize_transpose = [&](const std::shared_ptr& transpose, bool is_input_transposed, std::vector order, const ov::NodeVector::const_iterator& pos) { @@ -404,11 +365,15 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } }; - const auto transpose1 = ov::as_type_ptr(parent); + // [160177]: Due to performance problems, if operations on the 2nd input of MatMuls should be explicitly executed + // (in other words, if the Buffer should be inserted between Brgemm and this op sequence), + // we don't tokenize such operations into the Subgraph. The details are described in ticket 160177. + // Please return the tokenization of these ops when parallel loops are implemented. const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); + const auto transpose1 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(1)); const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); + tokenize_transpose(transpose1, matmul0->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); ordered_ops.push_back(matmul1); diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index ae95a371483163..b6b8cdd70f0bc8 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -4,8 +4,8 @@ #include "snippets/pass/split_dimension_m.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/itt.hpp" +#include "snippets/utils/utils.hpp" namespace { size_t get_dim_M(const ov::Shape& shape) { @@ -26,50 +26,69 @@ bool is_prime_number(size_t value) { namespace ov { namespace snippets { namespace pass { + +const size_t SplitDimensionM::min_kernel_m = 32; + bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& node) { const auto matmul = ov::as_type_ptr(node); return matmul && !matmul->get_transpose_a() && !matmul->is_dynamic(); } -std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { - std::pair splited = { 1, m_dim }; - +std::pair SplitDimensionM::split_ideally(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { // Ideal case #1: M can be split on the parts one of which complements the batch dimension to the optimal parallel work amount // In this case, each thread will execute the Snippets kernel once const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; - if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { - splited.first = lower_bound; - splited.second = m_dim / lower_bound; - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } + if (lower_bound * batch_dim == optimal_parallelism_work_amount && 
m_dim % lower_bound == 0) + return std::make_pair(lower_bound, m_dim / lower_bound); // Ideal case #2: M is divisible by optimal parallel work amount, and the new_m_dim is big enough // In this case, each thread will execute the Snippets kernel 'batch_dim' times if (m_dim % optimal_parallelism_work_amount == 0) { const auto new_m_dim = m_dim / optimal_parallelism_work_amount; - const size_t min_kernel_m = 64; - if (new_m_dim >= min_kernel_m) { - splited.first = optimal_parallelism_work_amount; - splited.second = new_m_dim; - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } + if (new_m_dim >= min_kernel_m) + return std::make_pair(optimal_parallelism_work_amount, new_m_dim); } + return std::make_pair(1, m_dim); +} + +std::pair SplitDimensionM::split_fallback_increase_parallel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { + std::pair splited = { 1, m_dim }; const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { size_t divisor_1 = m_dim / divisor_0; - if (divisor_1 * divisor_0 == m_dim) { - splited.first = divisor_0; - splited.second = divisor_1; - break; - } + if (divisor_1 * divisor_0 == m_dim) + return divisor_0 * batch_dim >= optimal_parallelism_work_amount ? std::make_pair(divisor_0, divisor_1) : splited; } - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); return splited; } +std::pair SplitDimensionM::split_minimize_kernel_wa(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { + // This heuristic minimizes 'm_kernel' (=> maximizes 'm_batch') with a limitation that 'm_kernel >= min_kernel_m'. + // In other words, it tries to find 'm_kernel' that is not less than 'min_kernel_m' and at the same time as close as possible to this value. + std::pair best_result = {1, m_dim}; + for (size_t divisor = 2; divisor < std::sqrt(m_dim); ++divisor) { + if (m_dim % divisor != 0) + continue; + // If divisor is not less than 'min_kernel_m', divisor becomes 'm_kernel', + // guaranteeing the most optimal implementation from the 'm_kernel' minimization perspective. + if (divisor >= min_kernel_m) + return std::make_pair(m_dim / divisor, divisor); + + // If divisor is less than 'min_kernel_m', divisor becomes m_batch. + // However, it is not guaranteed that the current 'm_kernel = m_dim / divisor' is minimized, as one of the next divisors can be more optimal. 
+ // So in this case, the best result is remembered. + const size_t m_kernel = m_dim / divisor; + if (m_kernel >= min_kernel_m) { + best_result.first = divisor; + best_result.second = m_kernel; + } + } + if (best_result.first * batch_dim >= optimal_parallelism_work_amount) + return best_result; + return std::make_pair(1, m_dim); +} + bool SplitDimensionM::can_be_optimized(const std::shared_ptr& node, size_t concurrency) { if (!is_supported_matmul(node)) return false; @@ -131,16 +150,25 @@ bool SplitDimensionM::split(const ov::Shape& shape, size_t optimal_parallelism_w if (is_prime_number(m_dim)) return false; - auto is_optimized = [&](size_t batch_dim) { - return batch_dim >= optimal_parallelism_work_amount; - }; - // We skip optimization if the current batch is optimal for concurrency - if (is_optimized(batch_dim)) + if (batch_dim % optimal_parallelism_work_amount == 0) return false; - std::tie(batch_m_dim, new_m_dim) = get_splited_dimensions(batch_dim, m_dim, optimal_parallelism_work_amount); - return is_optimized(batch_dim * batch_m_dim); + auto split_is_done = [&batch_m_dim]() { + return batch_m_dim != 1; + }; + + std::tie(batch_m_dim, new_m_dim) = split_ideally(batch_dim, m_dim, optimal_parallelism_work_amount); + if (split_is_done()) + return true; + + std::tie(batch_m_dim, new_m_dim) = split_minimize_kernel_wa(batch_dim, m_dim, optimal_parallelism_work_amount); + if (split_is_done()) + return true; + // If all the previous heuristics fail, a fallback heuristic is used, which reflects the old splitting behavior + if (batch_dim < optimal_parallelism_work_amount) + std::tie(batch_m_dim, new_m_dim) = split_fallback_increase_parallel_wa(batch_dim, m_dim, optimal_parallelism_work_amount); + return split_is_done(); } void SplitDimensionM::reshape_subgraph(const std::shared_ptr& subgraph, const ov::Shape& shape, size_t batch_m_dim, size_t new_m_dim) { diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp index 5c29b493af5826..a433cd41377422 100644 --- a/src/common/snippets/src/pass/transpose_decomposition.cpp +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -60,9 +60,9 @@ TransposeDecomposition::TransposeDecomposition() { const auto subtensor = std::vector{1}; const auto& layout = order->cast_vector(); - // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // todo: LoadReorder used here is essentially Load + an easy way to maintain correct shape propagation // fix this in future and develop a more consistent shape propagation approach. - auto load = std::make_shared(data_input, subtensor[0], 0, layout); + auto load = std::make_shared(data_input, subtensor[0], 0, layout); auto store = std::make_shared(load, subtensor[0]); PortDescriptorUtils::set_port_descriptor(load->input(0), subtensor, layout); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 06beb8db94ae3d..5527cebb63f24f 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -118,7 +118,26 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) // input->shape changing ops->load PortDescriptorPtr desc = nullptr; const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); - const auto& mem_desc_expr = shape_infer_seq.empty() ? 
param : shape_infer_seq.back(); + ExpressionPtr mem_desc_expr = param; + if (!shape_infer_seq.empty()) { + // [160048] Reorder, like any other ShapeInferOp, should just propagate the input shape to the output using the target order + // without data movement. However, currently we have to save the descriptor of the Reorder's input + // to support correct input data offset calculations and the MHAParallelWAOptimizer pass. + // Please remove this code part when the mentioned ticket is completed. + const auto& reorder_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reorder_it != shape_infer_seq.cend()) { + const auto& reorder = *reorder_it; + const auto& etype = reorder->get_node()->get_output_element_type(0); + update_io_parameters(reorder->get_input_port_descriptor(0), etype); + continue; + } + + mem_desc_expr = shape_infer_seq.back(); + } + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = std::dynamic_pointer_cast(child_input.get_expr()->get_node()); @@ -127,6 +146,7 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) break; } } + OPENVINO_ASSERT(desc, "Descriptor is missing!"); const auto& etype = mem_desc_expr->get_node()->get_output_element_type(0); update_io_parameters(desc, etype); } diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index a3e3d9652c0ac8..3fed1d924a7140 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -228,22 +228,5 @@ Result ReduceShapeInfer::infer(const std::vector& input_shapes) { return {{result_shape}, ShapeInferStatus::success}; } -ReshapeShapeInfer::ReshapeShapeInfer(const std::shared_ptr& n) { - const auto& reshape = as_type_ptr(n); - OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeShapeInfer."); - const auto& partial_shape = reshape->get_target_shape(); - OPENVINO_ASSERT(partial_shape.is_static(), "target_shape of reshape op should be static in ReshapeShapeInfer"); - target_shape = partial_shape.get_shape(); - target_shape_volume = utils::get_shape_size(target_shape); -} - -Result ReshapeShapeInfer::infer(const std::vector& input_shapes) { - OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes is passed in ReshapeShapeInfer"); - const auto input_shape_volume = utils::get_shape_size(input_shapes[0].get()); - OPENVINO_ASSERT(input_shape_volume == target_shape_volume, "Tensor volume should be the same after reshape in ReshapeShapeInfer"); - - return {{target_shape}, ShapeInferStatus::success}; -} - } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 76a4c491c66983..0e3060501a87d5 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -57,7 +57,6 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelStatic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), - SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, 
SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), @@ -65,7 +64,9 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry // Note that Result has no output PortConnectors, so the shape must be empty SHAPE_INFER_PREDEFINED(ov::op::v0::Result, EmptyShapeInfer), // - SHAPE_INFER_OP_SPECIFIC(op::LoadReshape), + SHAPE_INFER_OP_SPECIFIC(op::LoadReorder), + SHAPE_INFER_OP_SPECIFIC(op::Reshape), + SHAPE_INFER_OP_SPECIFIC(op::Reorder), SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), SHAPE_INFER_OP_SPECIFIC(op::BroadcastLoad), SHAPE_INFER_OP_SPECIFIC(op::BroadcastMove), diff --git a/src/common/snippets/src/utils/utils.cpp b/src/common/snippets/src/utils/utils.cpp index e7381fe6754758..249970b65baa5d 100644 --- a/src/common/snippets/src/utils/utils.cpp +++ b/src/common/snippets/src/utils/utils.cpp @@ -317,14 +317,21 @@ std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const st } int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx) { - size_t dim_idx = 0; + const auto& shape = expr_port.get_descriptor_ptr()->get_shape(); const auto& layout = expr_port.get_descriptor_ptr()->get_layout(); switch (expr_port.get_type()) { - case lowered::ExpressionPort::Input: dim_idx = utils::get_input_dim_idx(layout, idx); break; - case lowered::ExpressionPort::Output: dim_idx = utils::get_output_dim_idx(layout, idx); break; - default: OPENVINO_THROW("Unsupported expression port type!"); + case lowered::ExpressionPort::Input: return get_dim_in_stride(shape, layout, idx); + case lowered::ExpressionPort::Output: return get_dim_out_stride(shape, layout, idx); } - return get_stride(dim_idx, expr_port.get_descriptor_ptr()->get_shape()); + OPENVINO_THROW("Unsupported expression port type!"); +} + +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_input_dim_idx(layout, idx), shape); +} + +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_output_dim_idx(layout, idx), shape); } void visit_path(const lowered::ExpressionPtr& expr, diff --git a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp index ee76c5af7234d8..b9ff7bda6823ed 100644 --- a/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp +++ b/src/common/snippets/tests/src/lowered/pass/extracted_loop_invariants.cpp @@ -299,7 +299,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { * * Param0(32,8,1) * | - * LoadReshape with order (1,2,0) + * LoadReorder with order (1,2,0) * | * Store * | @@ -307,7 +307,7 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { */ { auto param = linear_ir->push_node(input_precision, input_shape_0); - auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); + auto load_reshape = linear_ir->push_node(param.second, 1, 0, layout); auto store = linear_ir->push_node(load_reshape.second, 1, 0); init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout}); init_expr_descriptors(*store.first, {subtensor, subtensor}, {layout, layout}); diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 382257f935cc49..d725c36e5c35a5 100644 --- 
a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -160,7 +160,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dynamic_Transpose_fusion) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{2, 64, 12, 64}, {12, 1, 64, 128}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -171,7 +171,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{4, 32, 12, 64}, {12, 1, 64, 128}, {12, 4, 32, 128}, {1, 128, 12, 64}, {128, 12, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -182,7 +182,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 12, 32, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -193,7 +193,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 12, 32, 16, 64}, {1, 16, 1, 64, 384}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp index 9e801fceae02e9..df7c277d775cb4 100644 --- a/src/common/snippets/tests/src/utils/split_dim_m.cpp +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -59,6 +59,11 @@ const std::vector split_dimension_cases = { {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, + {InputData{48, 4097, 32}, ReferenceData{true, 17, 241}}, + {InputData{48, 6600, 32}, ReferenceData{true, 200, 33}}, + {InputData{12, 128, 16}, ReferenceData{true, 4, 32}}, + {InputData{16, 384, 60}, ReferenceData{true, 12, 32}}, + {InputData{16, 384, 24}, ReferenceData{true, 12, 32}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp index cf89181e2a7979..cdc768f5d4e1cc 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp @@ -141,10 +141,11 @@ bool CompiledSnippetCPU::empty() const { return get_code_size() == 0; } -CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) - : TargetMachine(std::make_shared()), +CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), - isa(host_isa) { + isa(host_isa), + compiled_kernel_cache(std::move(cache)) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); @@ -213,7 +214,7 @@ CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa) } std::shared_ptr CPUTargetMachine::clone() const { - const auto cloned = std::make_shared(isa); + const auto cloned = std::make_shared(isa, compiled_kernel_cache); cloned->configurator = std::make_shared(*configurator); return cloned; } @@ -250,14 +251,15 @@ dnnl::impl::cpu::aarch64::cpu_isa_t CPUTargetMachine::get_isa() const { return isa; } -CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_) - : Generator(std::make_shared(isa_)) {} +CPUGenerator::CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) + : Generator(std::make_shared(isa_, std::move(cache))) {} +CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} std::shared_ptr CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target); OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); - return std::make_shared(cpu_target_machine->get_isa()); + return std::make_shared(cpu_target_machine); } ov::snippets::RegType CPUGenerator::get_specific_op_out_reg_type(const ov::Output& out) const { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp index 4006fc01b9a1f5..90c2662e33d070 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp @@ -4,6 +4,7 @@ #pragma once +#include "cache/multi_cache.h" #include "cpu/aarch64/jit_generator.hpp" #include "snippets/generator.hpp" #include "snippets/target_machine.hpp" @@ -25,7 +26,7 @@ class CompiledSnippetCPU : public snippets::CompiledSnippet { class CPUTargetMachine : public snippets::TargetMachine { public: - explicit CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa); + explicit CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr); std::shared_ptr clone() const override; bool is_supported() const override; snippets::CompiledSnippetPtr get_snippet() override; @@ -36,11 +37,13 @@ class CPUTargetMachine : public snippets::TargetMachine { private: std::unique_ptr h; dnnl::impl::cpu::aarch64::cpu_isa_t isa; + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; class CPUGenerator : public snippets::Generator { public: - 
CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa); + CPUGenerator(dnnl::impl::cpu::aarch64::cpu_isa_t isa, ov::intel_cpu::MultiCacheWeakPtr); + CPUGenerator(const std::shared_ptr& target); std::shared_ptr clone() const override; protected: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 65741d7031d289..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif @@ -39,12 +39,13 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() - : ov::snippets::RuntimeConfigurator(std::make_shared()) {} +CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) + : ov::snippets::RuntimeConfigurator(std::make_shared()), + compiled_kernel_cache(std::move(cache)) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 1706670ce870d1..425959c289b3a7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -4,8 +4,9 @@ #pragma once +#include "cache/multi_cache.h" #include "emitters/snippets/jit_snippets_call_args.hpp" -#include "memory_desc/cpu_blocked_memory_desc.h" +#include "emitters/snippets/repacked_input.hpp" #include "snippets/lowered/port_descriptor.hpp" #include "snippets/runtime_configurator.hpp" @@ -21,13 +22,20 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif + enum class RepackingImplType { + NONE, // no repacking outside the kernel + IN_PARALLEL, // should be executed in parallel_nt by each thread + SEPARATE, // should be executed separately from the kernel + }; + RepackingImplType repacking_impl_type = RepackingImplType::NONE; + + std::unordered_map repacked_inputs = {}; std::vector loop_args = {}; - std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + // Note: This method is temporarily used only by `BrgemmExternalRepackingAdjuster` to create kernels for repacking. 
+ // Please, remove this method when the adjuster is deprecated + const ov::intel_cpu::MultiCacheWeakPtr& get_cache() const { + return compiled_kernel_cache; + } + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ -42,6 +56,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; + + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp new file mode 100644 index 00000000000000..a9fbf04d27392b --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "repacked_input.hpp" + +namespace ov { +namespace intel_cpu { + +RepackedInput::RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets) + : m_kernel(std::move(kernel)), + m_desc(std::move(desc)), + m_in_offsets(std::move(in_offsets)), + m_out_offsets(std::move(out_offsets)) { + OPENVINO_ASSERT(m_in_offsets.size() == m_out_offsets.size(), "Incorrect size of offsets"); + OPENVINO_ASSERT(m_desc, "Descriptor is empty"); +} + +const CpuBlockedMemoryDescPtr& RepackedInput::desc() const { + return m_desc; +} + +const VectorDims& RepackedInput::in_offsets() const { + return m_in_offsets; +} + +const VectorDims& RepackedInput::out_offsets() const { + return m_out_offsets; +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp new file mode 100644 index 00000000000000..61daaa859ef603 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/repacked_input.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "memory_desc/cpu_blocked_memory_desc.h" + +namespace ov { +namespace intel_cpu { + +struct RepackedInputKernel { + RepackedInputKernel() = default; + virtual ~RepackedInputKernel() = default; + virtual void operator()(const void* args) const = 0; +}; + +struct RepackedInput { + RepackedInput() = default; + RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets); + + template ::value, bool>::type = true> + std::shared_ptr kernel() const { + const auto ker = std::dynamic_pointer_cast(m_kernel); + OPENVINO_ASSERT(ker, "Kernel is empty!"); + return ker; + } + + const CpuBlockedMemoryDescPtr& desc() const; + const VectorDims& in_offsets() const; + const VectorDims& out_offsets() const; + +private: + std::shared_ptr m_kernel{nullptr}; + CpuBlockedMemoryDescPtr m_desc{nullptr}; + VectorDims m_in_offsets{}; + VectorDims m_out_offsets{}; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 39e384837856a1..31daa32dfa144f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -165,7 +165,7 @@ class jit_snippet : 
public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { @@ -177,9 +177,10 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Reorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = + jitters[snippets::op::LoadReorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index 6df658d8d72d0c..861b9779c25533 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -21,15 +21,6 @@ using namespace ov::snippets::utils; namespace ov { namespace intel_cpu { -namespace { -bool get_is_transposed(const ov::snippets::lowered::ExpressionPtr& expr) { - const auto& layout = expr->get_input_port_descriptor(0)->get_layout(); - const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1; - OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)), - "supports only N dim placed as last or pre last dimension"); - return is_transposed; -} -} // namespace jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t isa, @@ -50,7 +41,7 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); - const auto is_transposed = get_is_transposed(expr); + const auto is_transposed = BrgemmCopyB::is_transposed(expr->get_input_port_descriptor(0)->get_layout()); const auto brgemm_type = get_brgemm_type(src_prc, is_transposed); const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); m_with_comp = with_compensations(brgemm_type); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index dd216517ace12e..7aca5f6c6a696f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -145,10 +145,11 @@ std::string BrgemmCopyBKernelConfig::StaticParams::to_string() const { # undef PRINT #endif -BrgemmCopyBKernel::BrgemmCopyBKernel() : jit_generator(jit_name()), ker_(nullptr) {} +BrgemmCopyBKernel::BrgemmCopyBKernel() : 
RepackedInputKernel(), jit_generator(jit_name()), ker_(nullptr) {} BrgemmCopyBKernel::BrgemmCopyBKernel(const BrgemmCopyBKernelConfig& conf) - : jit_generator(jit_name()), + : RepackedInputKernel(), + jit_generator(jit_name()), is_with_comp(conf.is_with_comp()), is_transpose(conf.is_transposed_B()), wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), @@ -169,9 +170,11 @@ status_t BrgemmCopyBKernel::create_kernel() { return code; } -void BrgemmCopyBKernel::operator()(const call_args* args) const { +void BrgemmCopyBKernel::operator()(const void* args) const { + const auto* call_args = reinterpret_cast(args); + OV_CPU_JIT_EMITTER_ASSERT(call_args, "Call arguments are nullptr!"); OV_CPU_JIT_EMITTER_ASSERT(ker_, "Kernel is nullptr"); - ker_(args); + ker_(call_args); } void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index b3b107cd676705..5ef740067f2035 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -10,6 +10,7 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/snippets/cpu_kernel_executor_table.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/repacked_input.hpp" namespace ov { namespace intel_cpu { @@ -139,7 +140,7 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon size_t m_hash{SIZE_MAX}; }; -struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { +struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x64::jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(BrgemmCopyBKernel) struct call_args { const void* src = nullptr; @@ -152,7 +153,7 @@ struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { dnnl::impl::status_t create_kernel() override; - void operator()(const call_args* args) const; + void operator()(const void* args) const override; private: void generate() override; diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index 95de3720bb1e25..38ba9c484620db 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -172,7 +172,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoadReorder) \ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ OP_EXTENSION(ov::snippets::op::Buffer) \ diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp new file mode 100644 index 00000000000000..a8fac443391289 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/aarch64/subgraph.hpp" + +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const 
std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphBaseExecutor(snippet_config, + snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + allocator, + kernel_cache) { + m_buffer_scratchpad = allocator(m_internal_buffer_size); +} + +void SubgraphStaticExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + callable(&call_args, indexes.data()); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); + OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), + "Data offsets with invalid ranks detected"); + + // Note: we need to reset KernelExecutorTable to the state that was recorded in the + // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes + m_reset_exec_table_state(); + + std::vector src_ptrs; + std::vector dst_ptrs; + init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + callable(&call_args); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp new file mode 100644 index 00000000000000..54d7f27a79fd17 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +class SubgraphExecutor : public SubgraphBaseExecutor { +public: + SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); +}; + +class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { +public: + template + SubgraphStaticExecutor(T&& first, Args&&... 
rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphStaticBaseExecutor() {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { +public: + template + SubgraphDynamicSpecializedExecutor(T&& first, Args&&... rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphDynamicSpecializedBaseExecutor(std::forward(first)) {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp new file mode 100644 index 00000000000000..739ae56be3b4ff --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/subgraph.hpp" + +#include "common/primitive_hashing_utils.hpp" +#include "openvino/core/parallel.hpp" + +namespace ov { +namespace intel_cpu { + +bool operator==(const SubgraphAttrs& lhs, const SubgraphAttrs& rhs) { + if (&lhs == &rhs) + return true; + if (lhs.bodyHash != rhs.bodyHash) + return false; + if (lhs.inMemOrders.size() != rhs.inMemOrders.size() || lhs.inMemPrecs.size() != rhs.inMemPrecs.size()) + return false; + if (lhs.outMemOrders.size() != rhs.outMemOrders.size() || lhs.outMemPrecs.size() != rhs.outMemPrecs.size()) + return false; + if (lhs.inMemOrders != rhs.inMemOrders || lhs.inMemPrecs != rhs.inMemPrecs) + return false; + if (lhs.outMemOrders != rhs.outMemOrders || lhs.outMemPrecs != rhs.outMemPrecs) + return false; + return true; +} + +size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs) { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + for (const auto& order : attrs->inMemOrders) + seed = get_vector_hash(seed, order); + for (const auto& prec : attrs->inMemPrecs) + seed = hash_combine(seed, prec.hash()); + + for (const auto& order : attrs->outMemOrders) + seed = get_vector_hash(seed, order); + for (const auto& prec : attrs->outMemPrecs) + seed = hash_combine(seed, prec.hash()); + + seed = hash_combine(seed, attrs->bodyHash); + return seed; +} + +SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, + const std::shared_ptr& config) { + OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!"); + OPENVINO_ASSERT(config, "Runtime Config is empty!"); + + jit_snippets_compile_args jcp; + jcp.data_offsets = config->io_data_offsets; + SubgraphBaseExecutor::init_parallel_domain(config, jcp.exec_domain); + schedule = + std::make_shared(snippet_attrs->snippet->generate(reinterpret_cast(&jcp))); +} + +SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : m_schedule(snippet->get()), + m_start_offset_in(start_offset_in), + m_start_offset_out(start_offset_out) { + OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); + OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); + init_parallel_domain(snippet_config, m_parallel_exec_domain); + + m_tensor_rank = 
snippet_config->tensor_rank; + m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(), + m_parallel_exec_domain.cend(), + size_t(1), + std::multiplies()); + m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); + + m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size; + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), + "Undefined buffer scratchpad size!"); + m_internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; +} + +void SubgraphBaseExecutor::init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain) { + domain.resize(tensor_rank, 1); + std::fill(domain.begin(), domain.end(), 1); + std::copy(master_shape.cbegin(), + master_shape.cbegin() + (master_shape.size() - tile_rank), + domain.begin() + (tensor_rank - master_shape.size())); +} + +void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr& snippet_config, + std::vector& domain) { + init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); +} + +void SubgraphBaseExecutor::parallel_for6d(const initializer_functor& initializer, const call_functor& caller) { + const auto& dom = m_parallel_exec_domain; + + parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + initializer(call_args, ithr); + + size_t start = 0, end = 0; + splitter(m_harness_work_amount, nthr, ithr, start, end); + + std::vector indexes{0, 0, 0, 0, 0}; + parallel_it_init(start, + indexes[0], + dom[0], + indexes[1], + dom[1], + indexes[2], + dom[2], + indexes[3], + dom[3], + indexes[4], + dom[4]); + for (size_t iwork = start; iwork < end; ++iwork) { + caller(call_args, indexes, ithr); + parallel_it_step(indexes[0], + dom[0], + indexes[1], + dom[1], + indexes[2], + dom[2], + indexes[3], + dom[3], + indexes[4], + dom[4]); + } + }); +} + +void SubgraphBaseExecutor::parallel_forNd(const initializer_functor& initializer, const call_functor& caller) { + const auto& dom = m_parallel_exec_domain; + + parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + initializer(call_args, ithr); + + size_t start = 0, end = 0; + splitter(m_harness_work_amount, nthr, ithr, start, end); + + std::vector indexes(dom.size() - 1, 0); + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = static_cast(dom.size()) - 2; j >= 0; j--) { + indexes[j] = tmp % dom[j]; + tmp /= dom[j]; + } + + caller(call_args, indexes, ithr); + } + }); +} + +void SubgraphBaseExecutor::execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + exec_impl(inMemPtrs, outMemPtrs); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp new file mode 100644 index 00000000000000..78cb56440203d2 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp @@ -0,0 +1,188 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_memory.h" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/generator.hpp" +#include "snippets/op/subgraph.hpp" + +namespace ov { +namespace intel_cpu { + +struct SubgraphAttrs { + // Local copy of 
subgraph node for canonicalization & code generation + std::shared_ptr snippet; + uint64_t bodyHash; + std::vector inMemOrders; + std::vector outMemOrders; + std::vector inMemPrecs; + std::vector outMemPrecs; +}; +bool operator==(const SubgraphAttrs& lhs, const SubgraphAttrs& rhs); +size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs); + +class SubgraphCodeGenerator { +public: + SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, + const std::shared_ptr& config); + + const std::shared_ptr& get() const { + return schedule; + } + +private: + std::shared_ptr schedule; +}; + +class SubgraphBaseExecutor { +public: + using BufferScratchpadAllocator = std::function; + + SubgraphBaseExecutor() = default; + SubgraphBaseExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); + virtual ~SubgraphBaseExecutor() = default; + + virtual void execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs); + + static void init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain); + static void init_parallel_domain(const std::shared_ptr& snippet_config, + std::vector& domain); + +protected: + virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + + using initializer_functor = std::function; + using call_functor = std::function&, size_t)>; + + virtual void parallel_for6d(const initializer_functor& initializer, const call_functor& caller); + virtual void parallel_forNd(const initializer_functor& initializer, const call_functor& caller); + + inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { + if (m_buffer_scratchpad_size > 0) + scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; + } + + std::shared_ptr m_schedule; + // Holds the parallel execution domain; + // it should be compatible with the schedule's work size + std::vector m_parallel_exec_domain = {}; + size_t m_harness_work_amount = 0; + + // Buffer scratchpad + MemoryPtr m_buffer_scratchpad = nullptr; + size_t m_buffer_scratchpad_size = 0; + size_t m_internal_buffer_size = 0; + size_t m_tensor_rank = 0; + + const size_t rank6D = 6; + + // Count of threads for parallel_nt + int m_nthreads = 0; + + std::vector m_start_offset_in = {}; + std::vector m_start_offset_out = {}; +}; + +// Class for Subgraphs with static shapes +class SubgraphStaticBaseExecutor { +public: + SubgraphStaticBaseExecutor() = default; + virtual ~SubgraphStaticBaseExecutor() = default; + +protected: + typedef void (*kernel)(const void*, const void*); + + inline void init_call_args(jit_snippets_call_args& call_args, + const std::vector& srcMemPtrs, + const std::vector& dstMemPtrs, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + size_t ithr) { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs() + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + start_offset_out[i]; + } +}; + +// Specialized dynamic executor based on a shape-agnostic kernel for the specific input shapes +class SubgraphDynamicSpecializedBaseExecutor { +public: + SubgraphDynamicSpecializedBaseExecutor(const
std::shared_ptr& snippet_config) { + m_buffer_offsets = snippet_config->buffer_cluster_offsets; + m_data_offsets = snippet_config->io_data_offsets; + m_loop_args = snippet_config->loop_args; + m_reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); + } + virtual ~SubgraphDynamicSpecializedBaseExecutor() = default; + +protected: + typedef void (*dynamic_kernel)(const void*); + + inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) { + call_args.register_loops(m_loop_args); + std::copy(m_buffer_offsets.cbegin(), m_buffer_offsets.cend(), call_args.buffer_offsets); + } + + inline void init_original_ptrs(const std::vector& srcMemPtrs, + const std::vector& dstMemPtrs, + std::vector& src_ptrs, + std::vector& dst_ptrs, + const std::vector& start_offset_in, + const std::vector& start_offset_out) { + const auto in_num = srcMemPtrs.size(); + const auto out_num = dstMemPtrs.size(); + + src_ptrs.resize(in_num, nullptr); + dst_ptrs.resize(out_num, nullptr); + + for (size_t i = 0; i < in_num; i++) + src_ptrs[i] = srcMemPtrs[i]->getDataAs() + start_offset_in[i]; + for (size_t i = 0; i < out_num; i++) + dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + start_offset_out[i]; + } + + inline void update_ptrs(jit_snippets_call_args& call_args, + const std::vector& src_ptrs, + const std::vector& dst_ptrs, + const std::vector& indexes) const { + for (size_t i = 0; i < src_ptrs.size(); i++) { + auto i_ptr = src_ptrs[i]; + for (size_t j = 0; j < indexes.size(); j++) { + i_ptr += m_data_offsets[i][j] * indexes[j]; + } + call_args.src_ptrs[i] = i_ptr; + } + for (size_t i = 0; i < dst_ptrs.size(); i++) { + auto i_ptr = dst_ptrs[i]; + for (size_t j = 0; j < indexes.size(); j++) { + i_ptr += m_data_offsets[i + src_ptrs.size()][j] * indexes[j]; + } + call_args.dst_ptrs[i] = i_ptr; + } + } + + std::vector m_buffer_offsets = {}; + std::vector> m_data_offsets = {}; + std::vector m_loop_args = {}; + std::function m_reset_exec_table_state; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp new file mode 100644 index 00000000000000..983c4410083beb --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp @@ -0,0 +1,328 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/x64/subgraph.hpp" + +#include "emitters/snippets/x64/cpu_generator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" +#include "openvino/core/parallel.hpp" +#include "snippets/op/subgraph.hpp" + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) +# include + +# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" +std::mutex err_print_lock; +#endif + +namespace ov { +namespace intel_cpu { + +namespace { +inline void parallel4d_repacking(const BrgemmCopyBKernel* ker, + const VectorDims& dom, + const VectorDims& in_str, + const VectorDims& out_str, + const uint8_t* src, + uint8_t* dst) { + parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { + BrgemmCopyBKernel::call_args args; + args.src = src + d0 * in_str[0] + d1 * in_str[1] + d2 * in_str[2] + d3 * in_str[3]; + args.tr_src = dst + d0 * out_str[0] + d1 * out_str[1] + d2 * out_str[2] + d3 * out_str[3]; + (*ker)(&args); + }); +}; +inline void parallelNd_repacking(const BrgemmCopyBKernel* ker, + const VectorDims& dom, + const VectorDims& in_str, + const VectorDims& 
out_str, + const uint8_t* src, + uint8_t* dst) { + const size_t batch = std::accumulate(dom.rbegin() + 2, dom.rend(), 1lu, std::multiplies()); + parallel_nt_static(0, [&](const int ithr, const int nthr) { + BrgemmCopyBKernel::call_args args; + size_t start = 0, end = 0; + splitter(batch, nthr, ithr, start, end); + for (size_t iwork = start; iwork < end; ++iwork) { + const uint8_t* src_u8 = src; + uint8_t* dst_u8 = dst; + size_t tmp = iwork; + for (ptrdiff_t j = static_cast(dom.size()) - 3; j >= 0; j--) { + auto idx = tmp % dom[j]; + tmp /= dom[j]; + + src_u8 += idx * in_str[j]; + dst_u8 += idx * out_str[j]; + } + args.src = src_u8; + args.tr_src = dst_u8; + (*ker)(&args); + } + }); +}; +} // namespace + +SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphBaseExecutor(snippet_config, + snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + allocator, + kernel_cache) { + m_repacking_impl_type = snippet_config->repacking_impl_type; + m_repacked_inputs = snippet_config->repacked_inputs; + + auto external_buffer_size = std::accumulate(m_repacked_inputs.begin(), + m_repacked_inputs.end(), + size_t(0), + [](size_t sum, const std::pair& p) { + return sum + p.second.desc()->getCurrentMemSize(); + }); + + if (get_repacking_impl_type() == RepackingImplType::IN_PARALLEL) { + // When external repacking is applied in the parallel section, + // each thread should have its own buffer to store the repacked data + external_buffer_size *= m_nthreads; + + // To avoid extra runtime overhead from vector creation, + // we initialize `m_repacked_offsets_by_threads` here by default + m_repacked_offsets_by_threads.resize(m_nthreads); + for (size_t i = 0; i < m_repacked_offsets_by_threads.size(); ++i) + clean_repacked_offsets(i); + + if (m_tensor_rank == rank6D) { + init_offset = [](const std::vector& offsets, const std::vector& indexes, size_t& offset) { + offset += offsets[0] * indexes[0] + offsets[1] * indexes[1] + offsets[2] * indexes[2] + + offsets[3] * indexes[3]; + }; + } else { + init_offset = [](const std::vector& offsets, const std::vector& indexes, size_t& offset) { + for (size_t j = 0; j < indexes.size(); j++) + offset += offsets[j] * indexes[j]; + }; + } + } + + m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size); + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + const auto target = std::dynamic_pointer_cast( + snippet_attrs->snippet->get_generator()->get_target_machine()); + enabled_segfault_detector = target && target->debug_config.enable_segfault_detector; +#endif +} + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) +void SubgraphExecutor::segfault_detector() { + if (enabled_segfault_detector) { + __sighandler_t signal_handler = [](int signal) { + std::lock_guard guard(err_print_lock); + if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local()) + std::cout << segfault_detector_emitter->info() << std::endl; + auto tid = parallel_get_thread_num(); + OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " + + std::to_string(tid)); + }; + struct sigaction new_handler {}; + new_handler.sa_handler = signal_handler; + sigaction(SIGSEGV, &new_handler, nullptr); + } +} +#endif +
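For reference, the flat-index decomposition used by parallelNd_repacking above (and by parallel_forNd in the base executor) can be reproduced in isolation. A minimal standalone sketch of that arithmetic, assuming plain std::vector domains; the names below are illustrative, not the plugin's API:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Recover per-dimension indices from a flat work-item counter, skipping the
    // trailing `skip` dimensions that the kernel itself covers (2 for the 2D
    // repacking tile, 1 for the generic parallel_forNd loop).
    std::vector<size_t> decompose(size_t iwork, const std::vector<size_t>& dom, size_t skip) {
        std::vector<size_t> indexes(dom.size() - skip, 0);
        size_t tmp = iwork;
        for (ptrdiff_t j = static_cast<ptrdiff_t>(indexes.size()) - 1; j >= 0; j--) {
            indexes[j] = tmp % dom[j];
            tmp /= dom[j];
        }
        return indexes;
    }

    int main() {
        const std::vector<size_t> dom{2, 3, 16, 64};  // batch dims {2, 3}, kernel dims {16, 64}
        for (size_t iwork = 0; iwork < 2 * 3; ++iwork) {
            const auto idx = decompose(iwork, dom, 2);
            std::cout << iwork << " -> [" << idx[0] << ", " << idx[1] << "]\n";  // row-major order
        }
    }

Each recovered batch index then scales the corresponding input/output stride to position the repacking kernel's source and destination pointers, exactly as in the loop above.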
+std::vector SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, + const std::vector& srcMemPtrs) { + auto reordered_in_ptrs = srcMemPtrs; + size_t offset = m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto in_idx = p.first; + const auto& repacked_input = p.second; + const auto& desc = repacked_input.desc(); + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + + OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); + const auto& src_mem = srcMemPtrs[in_idx]; + const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); + + const auto* src = src_mem->getDataAs() + m_start_offset_in[in_idx]; + auto* dst = dst_mem->getDataAs(); + + VectorDims dom; + const auto& shape = dst_mem->getShape().getDims(); + OPENVINO_ASSERT(shape.size() <= m_tensor_rank, "Unsupported shape rank of repacking data"); + init_parallel_domain(shape, m_tensor_rank, 2lu, dom); + + const auto& in_strides = repacked_input.in_offsets(); + const auto& out_strides = repacked_input.out_offsets(); + OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()), + "Unsupported shape rank of repacking data"); + + const auto& kernel = repacked_input.kernel(); + if (m_tensor_rank == rank6D) + parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); + else + parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); + + reordered_in_ptrs[in_idx] = dst_mem; + offset += desc->getCurrentMemSize(); + } + return reordered_in_ptrs; +} + +void SubgraphExecutor::in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args) { + size_t repacked_offset_idx = 0; + for (const auto& p : m_repacked_inputs) { + const auto& in_idx = p.first; + const auto& repacked_in = p.second; + + size_t src_offset = m_start_offset_in[in_idx]; + init_offset(repacked_in.in_offsets(), indexes, src_offset); + + auto* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx); + + auto& last_processed_src_offset = m_repacked_offsets_by_threads[ithr][repacked_offset_idx]; + if (src_offset != last_processed_src_offset) { + BrgemmCopyBKernel::call_args args; + args.src = inMemPtrs[in_idx]->getDataAs() + src_offset; + args.tr_src = repacked_ptr; + (*repacked_in.kernel())(&args); + + last_processed_src_offset = src_offset; + } + + call_args.src_ptrs[in_idx] = repacked_ptr; + ++repacked_offset_idx; + } +} + +void SubgraphExecutor::execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + switch (get_repacking_impl_type()) { + case RepackingImplType::SEPARATE: + exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs); + return; + case RepackingImplType::IN_PARALLEL: + case RepackingImplType::NONE: + exec_impl(inMemPtrs, outMemPtrs); + return; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } +} +
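The IN_PARALLEL path above repacks lazily: each thread tracks, per repacked input, the last source offset it converted and calls the kernel only when the offset changes. A minimal sketch of that caching idea, assuming a plain std::vector cache and a stand-in `repack` callable instead of the JIT kernel:

    #include <cstddef>
    #include <limits>
    #include <vector>

    // One slot per repacked input; max() is the "nothing repacked yet" sentinel,
    // mirroring what clean_repacked_offsets() restores before each harness run.
    struct RepackCache {
        std::vector<size_t> last_offsets;
        explicit RepackCache(size_t num_inputs)
            : last_offsets(num_inputs, std::numeric_limits<size_t>::max()) {}

        template <typename F>
        void repack_if_needed(size_t input_slot, size_t src_offset, F&& repack) {
            if (src_offset != last_offsets[input_slot]) {
                repack(src_offset);  // the expensive copy/transpose happens only here
                last_offsets[input_slot] = src_offset;
            }
        }
    };

Since consecutive work items handled by one thread frequently map to the same source region, this collapses many potential kernel calls into one per distinct offset.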
+void SubgraphStaticExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + initializer_functor initializer; + call_functor caller; + + switch (get_repacking_impl_type()) { + case RepackingImplType::IN_PARALLEL: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + clean_repacked_offsets(ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args, indexes.data()); + }; + break; + case RepackingImplType::SEPARATE: + case RepackingImplType::NONE: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + callable(&call_args, indexes.data()); + }; + break; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + segfault_detector(); +#endif + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); + OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), + "Data offsets with invalid ranks detected"); + + // Note: we need to reset KernelExecutorTable to the state that was recorded in the + // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes + m_reset_exec_table_state(); + + std::vector src_ptrs; + std::vector dst_ptrs; + init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); + + initializer_functor initializer; + call_functor caller; + + switch (get_repacking_impl_type()) { + case RepackingImplType::IN_PARALLEL: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + clean_repacked_offsets(ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args); + }; + break; + case RepackingImplType::SEPARATE: + case RepackingImplType::NONE: + initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, ithr); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + callable(&call_args); + }; + break; + default: + OPENVINO_THROW("Unknown RepackingImplType"); + } + +#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS) + segfault_detector(); +#endif + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp new file mode 100644 index 00000000000000..457d4982cf942a --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/subgraph.hpp" + 
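In the IN_PARALLEL mode, the executor declared below appends an external scratchpad after the internal per-thread buffers: one region per repacked input, each holding m_nthreads fixed-size slices, and get_external_scratchpad_ptr walks that layout linearly. A minimal standalone sketch of the addressing scheme (the ordered map and names are illustrative assumptions, not the plugin's API):

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <stdexcept>

    // Layout: | input a: nthr * size_a | input b: nthr * size_b | ... |
    // Thread `ithr` owns one `size`-byte slice inside the region of input `idx`.
    inline uint8_t* external_scratchpad_ptr(uint8_t* base,
                                            const std::map<size_t, size_t>& block_sizes,
                                            size_t nthr, size_t ithr, size_t idx) {
        uint8_t* data_ptr = base;
        for (const auto& p : block_sizes) {
            if (p.first == idx)
                return data_ptr + ithr * p.second;  // this thread's slice
            data_ptr += nthr * p.second;            // skip this input's whole region
        }
        throw std::runtime_error("External buffer pointer has not been found");
    }

Because every thread writes only to its own slice, the IN_PARALLEL repacking path needs no synchronization between threads.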
+namespace ov { +namespace intel_cpu { + +class SubgraphExecutor : public SubgraphBaseExecutor { +public: + SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); + + void execute(const dnnl::stream& strm, + const std::vector& inMemPtrs, + const std::vector& outMemPtrs) override; + +protected: + std::vector separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs); + void in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args); + + inline void* get_external_scratchpad_ptr(size_t ithr, size_t idx) const { + if (m_repacked_inputs.empty()) + return nullptr; + + uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto& desc = p.second.desc(); + const auto size = desc->getCurrentMemSize(); + if (p.first == idx) { + return data_ptr + ithr * size; + } + data_ptr += m_nthreads * size; + } + OPENVINO_THROW("External buffer pointer has not been found"); + } + + // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ] + std::vector> m_repacked_offsets_by_threads = {}; + std::unordered_map m_repacked_inputs = {}; + + std::function&, const std::vector&, size_t&)> init_offset = {}; + + using RepackingImplType = CPURuntimeConfig::RepackingImplType; + const RepackingImplType& get_repacking_impl_type() const { + return m_repacking_impl_type; + } + + inline void clean_repacked_offsets(size_t ithr) { + m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits::max()); + } + +#ifdef SNIPPETS_DEBUG_CAPS + bool enabled_segfault_detector = false; + inline void segfault_detector(); +#endif + +private: + RepackingImplType m_repacking_impl_type = RepackingImplType::NONE; +}; + +class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { +public: + template + SubgraphStaticExecutor(T&& first, Args&&... rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphStaticBaseExecutor() {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { +public: + template + SubgraphDynamicSpecializedExecutor(T&& first, Args&&...
rest) + : SubgraphExecutor(std::forward(first), std::forward(rest)...), + SubgraphDynamicSpecializedBaseExecutor(std::forward(first)) {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index ba6f1eda215dce..4a84fc6667406d 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -7,7 +7,6 @@ #include "dnnl_extension_utils.h" #include "onednn/dnnl.h" #include "openvino/core/parallel.hpp" -#include "openvino/core/rt_info.hpp" #include "shape_inference/custom/subgraph.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_buffers.hpp" @@ -27,9 +26,11 @@ #if defined(OPENVINO_ARCH_ARM64) # include "emitters/snippets/aarch64/cpu_generator.hpp" +# include "executors/aarch64/subgraph.hpp" # include "transformations/snippets/aarch64/shape_inference.hpp" #else # include "emitters/snippets/x64/cpu_generator.hpp" +# include "executors/x64/subgraph.hpp" # include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" # include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp" # include "transformations/snippets/x64/pass/enforce_precision.hpp" @@ -48,13 +49,6 @@ #include "utils/cpu_utils.hpp" #include "utils/ngraph_utils.hpp" -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) -# include - -# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" -std::mutex err_print_lock; -#endif - #ifdef SNIPPETS_LIBXSMM_TPP # include "snippets/lowered/pass/optimize_domain.hpp" # include "transformations/tpp/x64/pass/brgemm_to_brgemm_tpp.hpp" @@ -70,265 +64,76 @@ namespace intel_cpu { namespace node { namespace { -// Class for Subgraphs with static shapes -class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { -public: - SubgraphStaticExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {} - - void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { - const auto& callable = m_schedule->get_callable(); - - auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { - init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); - }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { - callable(&call_args, indexes.data()); - }; - - if (m_parallel_exec_domain.size() == rank6D) { - parallel_for6d(initializer, caller); - } else { - parallel_forNd(initializer, caller); - } - } - -protected: - typedef void (*kernel)(const void*, const void*); - - inline void init_call_args(jit_snippets_call_args& call_args, - const std::vector& srcMemPtrs, - const std::vector& dstMemPtrs, - size_t ithr) { - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs() + m_start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + m_start_offset_out[i]; - - update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); - } -}; - -// Specialized dynamic executor based on shape agnostic kernel for the specific input 
shapes -class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { -public: - SubgraphDynamicSpecializedExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { - buffer_offsets = snippet_config->buffer_cluster_offsets; - data_offsets = snippet_config->io_data_offsets; - loop_args = snippet_config->loop_args; - reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); - } - - void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { - const auto& callable = m_schedule->get_callable(); - - OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); - OPENVINO_ASSERT(data_offsets.front().size() == m_parallel_exec_domain.size(), - "Data offsets with invalid ranks detected"); - - // Note: we need to reset KernelExecutorTable to the state that was recorded in the - // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes - reset_exec_table_state(); - - std::vector src_ptrs; - std::vector dst_ptrs; - init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs); - - auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { - init_call_args(call_args, ithr); - }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { - update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); - callable(&call_args); - }; - - if (m_parallel_exec_domain.size() == rank6D) { - parallel_for6d(initializer, caller); - } else { - parallel_forNd(initializer, caller); - } - } - -protected: - typedef void (*dynamic_kernel)(const void*); - - inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) { - call_args.register_loops(loop_args); - std::copy(buffer_offsets.cbegin(), buffer_offsets.cend(), call_args.buffer_offsets); - - update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); - } - - inline void init_original_ptrs(const std::vector& srcMemPtrs, - const std::vector& dstMemPtrs, - std::vector& src_ptrs, - std::vector& dst_ptrs) { - const auto in_num = srcMemPtrs.size(); - const auto out_num = dstMemPtrs.size(); - - src_ptrs.resize(in_num, nullptr); - dst_ptrs.resize(out_num, nullptr); - - for (size_t i = 0; i < in_num; i++) - src_ptrs[i] = srcMemPtrs[i]->getDataAs() + m_start_offset_in[i]; - for (size_t i = 0; i < out_num; i++) - dst_ptrs[i] = dstMemPtrs[i]->getDataAs() + m_start_offset_out[i]; - } - - inline void update_ptrs(jit_snippets_call_args& call_args, - const std::vector& src_ptrs, - const std::vector& dst_ptrs, - const std::vector& indexes) const { - for (size_t i = 0; i < src_ptrs.size(); i++) { - auto i_ptr = src_ptrs[i]; - for (size_t j = 0; j < indexes.size(); j++) { - i_ptr += data_offsets[i][j] * indexes[j]; - } - call_args.src_ptrs[i] = i_ptr; - } - for (size_t i = 0; i < dst_ptrs.size(); i++) { - auto i_ptr = dst_ptrs[i]; - for (size_t j = 0; j < indexes.size(); j++) { - i_ptr += data_offsets[i + src_ptrs.size()][j] * indexes[j]; - } - call_args.dst_ptrs[i] = i_ptr; - } - } - - std::vector buffer_offsets = {}; - std::vector> data_offsets = {}; - std::vector loop_args = {}; - std::function reset_exec_table_state; -}; - +#if defined(OPENVINO_ARCH_X86_64) || 
defined(OPENVINO_ARCH_ARM64) struct SubgraphKey { SubgraphKey() = default; - SubgraphKey(const std::shared_ptr& attrs_, const std::vector& in_shapes_) + SubgraphKey(const std::shared_ptr& attrs_, const std::vector& in_shapes_) : attrs(attrs_), in_shapes(in_shapes_) {} virtual ~SubgraphKey() = default; - size_t hash() const; - bool operator==(const SubgraphKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + size_t seed = get_attr_hash(0, attrs); + for (const auto& shape : in_shapes) + seed = get_vector_hash(seed, shape); + + return seed; + } + bool operator==(const SubgraphKey& rhs) const { + return *attrs == *rhs.attrs && in_shapes == rhs.in_shapes; + } - std::shared_ptr attrs = nullptr; + std::shared_ptr attrs = nullptr; std::vector in_shapes = {}; }; struct SubgraphCodeGeneratorKey { - SubgraphCodeGeneratorKey(const std::shared_ptr& attrs_, uint8_t mask_) + SubgraphCodeGeneratorKey(const std::shared_ptr& attrs_, uint8_t mask_) : attrs(attrs_), broadcasting_mask(mask_) {} - size_t hash() const; - bool operator==(const SubgraphCodeGeneratorKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; - std::shared_ptr attrs = nullptr; + size_t seed = get_attr_hash(0, attrs); + return hash_combine(seed, broadcasting_mask); + } + bool operator==(const SubgraphCodeGeneratorKey& rhs) const { + return *attrs == *rhs.attrs && broadcasting_mask == rhs.broadcasting_mask; + } + + std::shared_ptr attrs = nullptr; uint32_t broadcasting_mask = 0; }; +#endif struct SubgraphShapeInferResultKey { SubgraphShapeInferResultKey(std::vector in_shapes_, uint64_t body_hash_) : in_shapes(std::move(in_shapes_)), body_hash(body_hash_) {} - size_t hash() const; - bool operator==(const SubgraphShapeInferResultKey& rhs) const; + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + size_t seed = hash_combine(0, body_hash); + for (const auto& shape : in_shapes) + seed = get_vector_hash(seed, shape); + + return seed; + } + bool operator==(const SubgraphShapeInferResultKey& rhs) const { + return body_hash == rhs.body_hash && in_shapes == rhs.in_shapes; + } std::vector in_shapes = {}; uint64_t body_hash = 0; }; -size_t get_attr_hash(size_t seed, const std::shared_ptr& attrs) { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - for (const auto& order : attrs->inMemOrders) - seed = get_vector_hash(seed, order); - for (const auto& prec : attrs->inMemPrecs) - seed = hash_combine(seed, prec.hash()); - - for (const auto& order : attrs->outMemOrders) - seed = get_vector_hash(seed, order); - for (const auto& prec : attrs->outMemPrecs) - seed = hash_combine(seed, prec.hash()); - - seed = hash_combine(seed, attrs->bodyHash); - return seed; -} - -size_t SubgraphKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = get_attr_hash(0, attrs); - for (const auto& shape : in_shapes) - seed = get_vector_hash(seed, shape); - - return seed; -} - -size_t SubgraphCodeGeneratorKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = get_attr_hash(0, attrs); - seed = hash_combine(seed, broadcasting_mask); - - return seed; -} - -size_t SubgraphShapeInferResultKey::hash() const { - using namespace dnnl::impl; - using namespace dnnl::impl::primitive_hashing; - - size_t seed = hash_combine(0, body_hash); - for (const 
auto& shape : in_shapes) - seed = get_vector_hash(seed, shape); - - return seed; -} - -bool operator==(const Subgraph::SubgraphAttrs& lhs, const Subgraph::SubgraphAttrs& rhs) { - if (&lhs == &rhs) - return true; - if (lhs.bodyHash != rhs.bodyHash) - return false; - if (lhs.inMemOrders.size() != rhs.inMemOrders.size() || lhs.inMemPrecs.size() != rhs.inMemPrecs.size()) - return false; - if (lhs.outMemOrders.size() != rhs.outMemOrders.size() || lhs.outMemPrecs.size() != rhs.outMemPrecs.size()) - return false; - if (lhs.inMemOrders != rhs.inMemOrders || lhs.inMemPrecs != rhs.inMemPrecs) - return false; - if (lhs.outMemOrders != rhs.outMemOrders || lhs.outMemPrecs != rhs.outMemPrecs) - return false; - return true; -} - -bool SubgraphKey::operator==(const SubgraphKey& rhs) const { - return *attrs == *rhs.attrs && in_shapes == rhs.in_shapes; -} - -bool SubgraphCodeGeneratorKey::operator==(const SubgraphCodeGeneratorKey& rhs) const { - return *attrs == *rhs.attrs && broadcasting_mask == rhs.broadcasting_mask; -} - -bool SubgraphShapeInferResultKey::operator==(const SubgraphShapeInferResultKey& rhs) const { - return body_hash == rhs.body_hash && in_shapes == rhs.in_shapes; -} - struct SubgraphShapeInferResult { SubgraphShapeInferResult(IShapeInfer::Result res) : result(std::move(res)) {} @@ -352,7 +157,8 @@ Subgraph::Subgraph(const std::shared_ptr& op, const GraphContext::CPtr subgraph_attrs->bodyHash = getBodyHash(tmp_snippet); #if defined(OPENVINO_ARCH_ARM64) - subgraph_attrs->snippet->set_generator(std::make_shared(host_isa)); + subgraph_attrs->snippet->set_generator( + std::make_shared(host_isa, context->getParamsCache())); #elif defined(OPENVINO_ARCH_X86_64) subgraph_attrs->snippet->set_generator(std::make_shared(host_isa, context->getParamsCache())); #else @@ -796,12 +602,13 @@ void Subgraph::optimizeIR() { } void Subgraph::prepareParams() { +#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) const auto& cache = context->getParamsCache(); - auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { + auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { const auto& snippet = subgraph_attrs->snippet; - SubgraphExecutor::BufferScratchpadAllocator allocator = [this](size_t size) { + SubgraphBaseExecutor::BufferScratchpadAllocator allocator = [this](size_t size) { return getScratchPadMem(std::make_shared(ov::element::u8, intel_cpu::Shape{size})); }; @@ -824,12 +631,13 @@ void Subgraph::prepareParams() { code_gen->get()->lowering_result.kernel_executor_table); } const auto& snippet_config = ov::as_type_ptr(snippet->update_runtime_config()); - return std::make_shared(key.attrs, + return std::make_shared(snippet_config, + key.attrs, code_gen, start_offset_in, start_offset_out, - snippet_config, - allocator); + allocator, + cache); } else { // Static case: // 1. 
Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be @@ -842,17 +650,20 @@ void Subgraph::prepareParams() { [&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr { return std::make_shared(key.attrs, snippet_config); }); - return std::make_shared(key.attrs, + return std::make_shared(snippet_config, + key.attrs, code_gen_result.first, start_offset_in, start_offset_out, - snippet_config, - allocator); + allocator, + cache); } }; const auto result = cache->getOrCreate(SubgraphKey(subgraph_attrs, in_shapes), builder); execPtr = result.first; +#endif + OPENVINO_ASSERT(execPtr != nullptr, "Executor is not created for node ", getName(), "."); } @@ -907,191 +718,6 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -namespace { -inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { - const auto& master_shape = snippet_config->master_shape; - const auto& tensor_rank = snippet_config->tensor_rank; - const auto& tile_rank = snippet_config->tile_rank; - domain.resize(tensor_rank, 1); - - std::fill(domain.begin(), domain.end(), 1); - std::copy(master_shape.cbegin(), - master_shape.cbegin() + (master_shape.size() - tile_rank), - domain.begin() + (tensor_rank - master_shape.size())); -} -} // namespace - -Subgraph::SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& config) { - OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!"); - OPENVINO_ASSERT(config, "Runtime Config is empty!"); - - jit_snippets_compile_args jcp; - jcp.data_offsets = config->io_data_offsets; - init_parallel_domain(config, jcp.exec_domain); - schedule = - std::make_shared(snippet_attrs->snippet->generate(reinterpret_cast(&jcp))); -} - -Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : m_schedule(snippet->get()), - m_start_offset_in(start_offset_in), - m_start_offset_out(start_offset_out) { - OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); - OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); - init_parallel_domain(snippet_config, m_parallel_exec_domain); - - m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(), - m_parallel_exec_domain.cend(), - size_t(1), - std::multiplies()); - m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); - - m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size; - OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), - "Undefined buffer scratchpad size!"); - m_internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; - m_in_requested_descs = snippet_config->m_in_requested_descs; - const auto external_repacking_buffer_size = - std::accumulate(m_in_requested_descs.begin(), - m_in_requested_descs.end(), - size_t(0), - [](size_t sum, const std::pair& requested_desc_elem) { - return sum + requested_desc_elem.second->getCurrentMemSize(); - }); - m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size); - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - const auto target = std::dynamic_pointer_cast( - snippet_attrs->snippet->get_generator()->get_target_machine()); - enabled_segfault_detector = 
target && target->debug_config.enable_segfault_detector; -#endif -} - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) -void Subgraph::SubgraphExecutor::segfault_detector() { - if (enabled_segfault_detector) { - __sighandler_t signal_handler = [](int signal) { - std::lock_guard guard(err_print_lock); - if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local()) - std::cout << segfault_detector_emitter->info() << std::endl; - auto tid = parallel_get_thread_num(); - OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " + - std::to_string(tid)); - }; - struct sigaction new_handler {}; - new_handler.sa_handler = signal_handler; - sigaction(SIGSEGV, &new_handler, nullptr); - } -} -#endif - -void Subgraph::SubgraphExecutor::parallel_for6d( - const std::function& initializer, - const std::function&)>& caller) { - const auto& dom = m_parallel_exec_domain; - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - segfault_detector(); -#endif - - parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - initializer(call_args, ithr); - - size_t start = 0, end = 0; - splitter(m_harness_work_amount, nthr, ithr, start, end); - - std::vector indexes{0, 0, 0, 0, 0}; - parallel_it_init(start, - indexes[0], - dom[0], - indexes[1], - dom[1], - indexes[2], - dom[2], - indexes[3], - dom[3], - indexes[4], - dom[4]); - for (size_t iwork = start; iwork < end; ++iwork) { - caller(call_args, indexes); - parallel_it_step(indexes[0], - dom[0], - indexes[1], - dom[1], - indexes[2], - dom[2], - indexes[3], - dom[3], - indexes[4], - dom[4]); - } - }); -} - -void Subgraph::SubgraphExecutor::parallel_forNd( - const std::function& initializer, - const std::function&)>& caller) { - const auto& dom = m_parallel_exec_domain; - -#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) - segfault_detector(); -#endif - - parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - initializer(call_args, ithr); - - size_t start = 0, end = 0; - splitter(m_harness_work_amount, nthr, ithr, start, end); - - std::vector indexes(dom.size() - 1, 0); - for (size_t iwork = start; iwork < end; ++iwork) { - size_t tmp = iwork; - for (ptrdiff_t j = static_cast(dom.size()) - 2; j >= 0; j--) { - indexes[j] = tmp % dom[j]; - tmp /= dom[j]; - } - - caller(call_args, indexes); - } - }); -} - -void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, - const std::vector& inMemPtrs, - const std::vector& outMemPtrs) { - if (!m_in_requested_descs.empty()) { - auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs); - exec_impl(reorderedInMemPtrs, outMemPtrs); - } else { - exec_impl(inMemPtrs, outMemPtrs); - } -} - -std::vector Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, - const std::vector& inMemPtrs) { - auto reordered_in_ptrs = inMemPtrs; - size_t offset = m_internal_buffer_size; - for (const auto& requested_descs_elem : m_in_requested_descs) { - const auto in_idx = requested_descs_elem.first; - const auto& requested_desc = requested_descs_elem.second; - - const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); - scratch_mem->load(*reordered_in_ptrs[in_idx]); - reordered_in_ptrs[in_idx] = scratch_mem; - offset += 
requested_desc->getCurrentMemSize(); - } - return reordered_in_ptrs; -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index a84e46d9ae02da..9e6cb3cd49a9d7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -4,10 +4,8 @@ #pragma once -#include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "executors/subgraph.hpp" #include "node.h" -#include "snippets/op/subgraph.hpp" #if defined(OPENVINO_ARCH_ARM64) # include "cpu/aarch64/cpu_isa_traits.hpp" @@ -15,8 +13,6 @@ # include "cpu/x64/cpu_isa_traits.hpp" #endif -#include - namespace ov { namespace intel_cpu { namespace node { @@ -41,21 +37,6 @@ class Subgraph : public Node { void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; - struct SubgraphAttrs { - // Local copy of subgraph node for canonization & code generation - std::shared_ptr snippet; - uint64_t bodyHash; - std::vector inMemOrders; - std::vector outMemOrders; - std::vector inMemPrecs; - std::vector outMemPrecs; - }; - - // Class for snippet compilation - class SubgraphCodeGenerator; - // Base class for executors - class SubgraphExecutor; - protected: IShapeInfer::Result shapeInfer() const override; @@ -103,79 +84,7 @@ class Subgraph : public Node { // Input shapes that are used in PrepareParams and ShapeInfer to avoid frequent memory allocation mutable std::vector in_shapes; - std::shared_ptr execPtr = nullptr; -}; - -class Subgraph::SubgraphCodeGenerator { -public: - SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& config); - - const std::shared_ptr& get() const { - return schedule; - } - -private: - std::shared_ptr schedule; -}; - -class Subgraph::SubgraphExecutor { -public: - using BufferScratchpadAllocator = std::function; - - SubgraphExecutor(const std::shared_ptr& snippet_attrs, - const std::shared_ptr& snippet, - const std::vector& start_offset_in, - const std::vector& start_offset_out, - const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator); - virtual ~SubgraphExecutor() = default; - - void execute(const dnnl::stream& strm, - const std::vector& inMemPtrs, - const std::vector& outMemPtrs); - -protected: - virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; - - void parallel_for6d(const std::function& initializer, - const std::function&)>& caller); - void parallel_forNd(const std::function& initializer, - const std::function&)>& caller); - - inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { - if (m_buffer_scratchpad_size > 0) - scratchpad_ptr = m_buffer_scratchpad->getDataAs() + ithr * m_buffer_scratchpad_size; - } - - std::shared_ptr m_schedule; - // Holds index of output used as in execution domain - // it should be compatible with a schedule's work size - std::vector m_parallel_exec_domain = {}; - size_t m_harness_work_amount = 0; - - // Buffer scratchpad - MemoryPtr m_buffer_scratchpad = nullptr; - size_t m_buffer_scratchpad_size = 0; - size_t m_internal_buffer_size = 0; - - const size_t rank6D = 6; - - // Count of threads for parallel_nt - int m_nthreads = 0; - - std::vector m_start_offset_in = {}; - std::vector m_start_offset_out = {}; - -#ifdef SNIPPETS_DEBUG_CAPS - bool enabled_segfault_detector = false; - inline void segfault_detector(); -#endif - -private: - 
std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); - - std::unordered_map m_in_requested_descs = {}; + std::shared_ptr execPtr = nullptr; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp index 7e52905145869f..ce57cd1529b893 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp @@ -114,6 +114,13 @@ size_t BrgemmCopyB::get_offset_compensations() const { return get_output_offset(1); } +bool BrgemmCopyB::is_transposed(const std::vector& layout) { + const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1; + OPENVINO_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)), + "BrgemmCopyB supports only the N dim placed as the last or second-to-last dimension"); + return is_transposed; +} + BrgemmCopyB::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { const auto& brg_copyb = ov::as_type_ptr(n); OPENVINO_ASSERT(brg_copyb, "Got invalid node in BrgemmCopyB::ShapeInfer"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp index 54e2c39fcf1c06..b4e7b030fc605b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp @@ -72,6 +72,8 @@ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op { Result infer(const std::vector& input_shapes) override; }; + static bool is_transposed(const std::vector& layout); + private: void custom_constructor_validate_and_infer_types(std::vector layout_input = {}); void validate_element_type(const ov::element::Type& element_type); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 939ae93ad92b18..b87a78c6b0cb40 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -10,6 +10,7 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "snippets/itt.hpp" #include "snippets/op/rank_normalization.hpp" +#include "snippets/op/reorder.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" namespace ov { @@ -30,12 +31,26 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); - // TODO: - // 1. Ticket 157340: support external repacking for copyB with compensations - // 2. 
Ticket 157339: support external repacking for non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || - brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) + + auto is_supported_layout = [](const std::vector& layout) { + return layout.empty() || (layout.size() - 1 == layout.back()); + }; + + // TODO [157340]: support external repacking for copyB with compensations + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || + transformation_callback(copy_b_node)) return false; + + // If the layout is non-planar, we should insert a Reorder to support shape inference + if (!ov::snippets::utils::is_planar_layout(layout)) { + const auto& subtensor = in_desc->get_subtensor(); + const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); + return ov::replace_node_update_name(copy_b_node, reshape); + } + + // Otherwise the layout is planar, so we can just remove BrgemmCopyB from the subgraph return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 1cb8263d189d18..16df97bb209ed9 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -70,8 +70,11 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) { // Repacking may be extracted outside the snippets kernel. In this case, the brgemm parent expression is a // Parameter. - if (is_type( - brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) + const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); + const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); + const auto source = + shape_infer_seq.empty() ? 
brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 78f9b928298a9d..add7c66d3d7ffc 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -5,6 +5,7 @@ #include "external_repacking_adjuster.hpp" #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" #include "snippets/itt.hpp" #include "snippets/utils/utils.hpp" @@ -14,59 +15,142 @@ namespace ov { namespace intel_cpu { +const size_t BrgemmExternalRepackingAdjuster::brgemm_kernel_rank = 2; + BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; - const auto consumers = param->get_output_port_connector(0)->get_consumers(); - const bool brgemm_with_extracted_repacking = - std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { - auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; - }); - if (brgemm_with_extracted_repacking) { - m_param_idces_with_external_repacking.insert(i); - // Ticket 157339: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), - "Non-planar layout is not supported for external repacking"); + const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); + const auto& out = shape_infer_consumers.empty() ? 
param->get_output_port(0) + : shape_infer_consumers.back()->get_output_port(0); + const auto consumers = out.get_connected_ports(); + + for (const auto& consumer : consumers) { + auto brgemm = ov::as_type_ptr(consumer.get_expr()->get_node()); + if (brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && consumer.get_index() == 1) { + const auto src_prc = brgemm->get_input_element_type(0); + const auto wei_prc = brgemm->get_input_element_type(1); + const auto isa = brgemm_utils::get_primitive_isa(src_prc, brgemm_utils::with_amx(brgemm->get_type())); + const auto inner_n_block = brgemm_utils::repacking::compute_inner_n_block(wei_prc); + const auto is_transposed_b = + BrgemmCopyB::is_transposed(m_configurator->get_io_descs()[i]->get_layout()); + auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, is_transposed_b, inner_n_block); + m_executors[i] = std::make_shared(configurator->get_cache(), config); + } } } } +VectorDims BrgemmExternalRepackingAdjuster::get_blk_order(size_t shape_rank) { + VectorDims order(shape_rank - brgemm_kernel_rank); + std::iota(order.begin(), order.end(), 0); + const auto last_idx = shape_rank - 1; + order.insert(order.end(), {last_idx - 1, last_idx, last_idx - 1}); + return order; +} + +VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc) { + const auto vnni_factor = brgemm_utils::compute_vnni_factor(prc); + const auto K = *++planar_shape.rbegin(); + const auto N = *planar_shape.rbegin(); + const auto new_K = snippets::utils::div_up(K, vnni_factor); + const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(prc)); + VectorDims blk_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank); + blk_shape.insert(blk_shape.end(), {new_K, new_N, vnni_factor}); + return blk_shape; +} + +void BrgemmExternalRepackingAdjuster::update_kernel(const RepackExecutorPtr& executor, + const VectorDims& shape, + const VectorDims& layout, + size_t N, + size_t K, + ov::element::Type prc) { + const auto generic_config = executor->get_config().get_clone_ptr(); + auto config = static_cast(generic_config.get()); + const auto idx = config->is_transposed_B() ? 
0 : 1; + const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, idx) * prc.size(); + config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, prc)); + executor->update_by_config(*config); +} + bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - auto& optimal_descs = cpu_config->m_in_requested_descs; - for (const auto& i : m_param_idces_with_external_repacking) { + + size_t data_size = 0; + for (const auto& p : m_executors) { + const auto& i = p.first; const auto& shape = cpu_config->io_shapes[i]; - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); - - const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); - const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); - const size_t brgemm_kernel_rank = 2; - // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); - // Then, the blocked dims are formed - requested_blocked_shape.insert(requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), - std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), - vnni_factor}); - - VectorDims requested_order(shape.size() - brgemm_kernel_rank); - std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; - requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - - optimal_descs[i] = - std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); - - ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); - m_configurator->compute_offsets(shape_for_offset, i, 0); + + const auto& layout = cpu_config->io_layouts[i]; + const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout); + const auto& K = *++planar_shape.rbegin(); + const auto& N = *planar_shape.rbegin(); + + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + const auto blk_shape = get_blk_shape(planar_shape, prc); + + // src data + dst data per kernel call + const auto src_data = N * K * prc.size(); + const auto dst_data = + std::accumulate(blk_shape.rbegin(), blk_shape.rbegin() + 3, prc.size(), std::multiplies()); + data_size += src_data + dst_data; + + update_kernel(p.second, shape, layout, N, K, prc); } + + const auto cache_size = dnnl::utils::get_cache_size(1, true) + dnnl::utils::get_cache_size(2, true); + const auto fit_into_cache = data_size < cache_size; + // Heuristic: if the external repacking data doesn't fit into the L1 and L2 caches, + // external repacking should be executed in a separate parallel section before kernel execution. + cpu_config->repacking_impl_type = fit_into_cache ? 
CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; + + const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL; + + for (const auto& p : m_executors) { + const auto& i = p.first; + const auto& shape = cpu_config->io_shapes[i]; + auto& repacked_in = cpu_config->repacked_inputs[i]; + + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, cpu_config->io_layouts[i]); + auto blk_shape = get_blk_shape(planar_shape, prc); + // In the parallel impl, each thread needs only a buffer of shape [K_blk, N_blk, VNNI] to store repacked data + if (is_impl_parallel) { + std::fill(planar_shape.rbegin() + brgemm_kernel_rank, planar_shape.rend(), 1); + std::fill(blk_shape.rbegin() + brgemm_kernel_rank + 1, blk_shape.rend(), 1); + } + const auto order = get_blk_order(planar_shape.size()); + const auto desc = std::make_shared(prc, Shape(planar_shape), blk_shape, order); + + // Save the original input offsets of the input before repacking. + // If the shape has not changed, we already created `RepackedInput` for this input on a previous pass + // call, so `cpu_config->io_data_offsets[i]` no longer contains the original input offsets: + // they were updated for blocked shapes (or zeroed) by the previous initialization and cannot be + // reused as the original offsets. + const auto in_offsets = + shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i]; + + // In the parallel case, the kernel should not add offsets to repacked inputs because + // they will be applied during repacking at the execution stage + if (is_impl_parallel) { + auto& offsets = cpu_config->io_data_offsets[i]; + std::fill(offsets.begin(), offsets.end(), 0); + } else { + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), blk_shape.begin(), blk_shape.end()); + m_configurator->compute_offsets(shape_for_offset, i, 0); + } + const auto out_offsets = cpu_config->io_data_offsets[i]; + + repacked_in = RepackedInput(p.second->get_kernel(), desc, in_offsets, out_offsets); + } + return true; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 4d0c9586f3be31..2ef0b382a6ad8c 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -5,6 +5,7 @@ #pragma once #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #include "snippets/lowered/pass/runtime_optimizer.hpp" #include "snippets/runtime_configurator.hpp" @@ -24,11 +25,23 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt bool run(const snippets::lowered::LinearIR& linear_ir) override; bool applicable() const override { - return !m_param_idces_with_external_repacking.empty(); + return !m_executors.empty(); } private: - std::set m_param_idces_with_external_repacking; + using RepackExecutorPtr = std::shared_ptr; + static VectorDims get_blk_order(size_t shape_rank); + static VectorDims get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc); + + 
void update_kernel(const RepackExecutorPtr& executor, + const VectorDims& shape, + const VectorDims& layout, + size_t N, + size_t K, + ov::element::Type prc); + + static const size_t brgemm_kernel_rank; + std::unordered_map m_executors; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp index 0f5a6472b741f4..0186e5b66030ca 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.cpp @@ -22,7 +22,7 @@ bool ov::intel_cpu::pass::FuseLoadStoreConvert::fuse_load_convert( const auto& load_output = input_connector->get_source(); const auto& load_expr = load_output.get_expr(); const auto load = ov::as_type_ptr(load_expr->get_node()); - if (!load || ov::is_type(load_expr->get_node()) || + if (!load || ov::is_type(load_expr->get_node()) || ov::is_type(load_expr->get_node())) return false; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 81eb70d328630d..7b787f2afd0296 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1081,10 +1081,7 @@ void Transformations::MainSnippets(void) { // Only FP32 dynamic MHA is supported if (matmul->is_dynamic()) return false; - // [114487] brgemm kernel in oneDNN requires brgemm_copy_b kernel if MatMul node has transposed_b=True - // The current solution with ExtractExplicitMatMulTranspose pass is slower for non-f32 cases than using of - // brgemm_copy_b kernel - if (matmul->get_transpose_a() || matmul->get_transpose_b()) + if (matmul->get_transpose_a()) return false; // [150842] The execution of Brgemm INT8/BF16/FP16 on AMX platforms depends on the value of "K % VNNIFactor". 
// For more details, please take a look at the ticket 150842 @@ -1113,6 +1110,7 @@ void Transformations::MainSnippets(void) { return false; const auto parallel_work_amount = std::accumulate(shape.rbegin() + 2, shape.rend(), ov::Dimension(1), std::multiplies()); + // Ticket 160154: enable tokenization for MHA with insufficient parallel work amount const auto is_unsupported_parallel_work_amount = static_cast(parallel_work_amount.get_length()) < tokenization_config.get_concurrency() && !ov::snippets::pass::SplitDimensionM::can_be_optimized(n, tokenization_config.get_concurrency()); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp index a94f52be91df02..b69dcb66fb2d44 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp @@ -296,7 +296,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHA, ElementType::f32}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}}), + ::testing::Values(ExpectedNodes{{"Subgraph", 2}}), // MHA + Decomposed Transpose on input ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -309,7 +309,7 @@ INSTANTIATE_TEST_SUITE_P( std::vector{ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}, + ::testing::Values(ExpectedNodes{{"Subgraph", 2}, // MHA + Decomposed Transpose on input {"Transpose", 1}}), // Plugin disables tokenization of Transpose on output ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -323,7 +323,7 @@ INSTANTIATE_TEST_SUITE_P( std::vector{ElementType::f16, ElementType::f16, ElementType::f16, ElementType::f16}), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), - ::testing::Values(ExpectedNodes{{"Subgraph", 1}, + ::testing::Values(ExpectedNodes{{"Subgraph", 2}, // MHA + Decomposed Transpose on input {"Transpose", 1}}), // Plugin disables tokenization of Transpose on output ::testing::Values(ov::test::utils::DEVICE_CPU)), MHATest::getTestCaseName); @@ -694,7 +694,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0, ::testing::Values(0), ::testing::Values(ExpectedNodes{ {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + {"Transpose", 2}}), // Decomposed Transpose on input + Transpose between MHA and Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); @@ -706,7 +706,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern1, ::testing::Values(1), ::testing::Values(ExpectedNodes{ {"Subgraph", 4}, // FQ on input x 2 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + {"Transpose", 2}}), // Decomposed Transpose on input + Transpose between MHA and Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); @@ -717,7 +717,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern2, ::testing::ValuesIn(matMulIn0PrecisionsQuant), ::testing::Values(2), ::testing::Values(ExpectedNodes{{"Subgraph", 3}, // FQ on inputs x 2 + MHA - {"Transpose", 0}}), // Transpose is fused + {"Transpose", 1}}), // Decomposed Transpose on input ::testing::Values(ov::test::utils::DEVICE_CPU)), 
MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index df0b69f99ef06d..1709fd21f988a0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -75,8 +75,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, ::testing::Values(ov::element::f32), ::testing::Values(false), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(2), // decomposed Transpose + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -88,8 +88,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul, ::testing::Values(ov::element::f32), ::testing::Values(true), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(2), // decomposed Transpose, Mul + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -125,9 +125,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_bf16_if_supported(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // MHA + 5 Converts + 1 Transpose on output + ::testing::Values(8), // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output ::testing::Values(6), // MHA + 5 Converts on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -140,8 +140,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), - ::testing::Values(6), + ::testing::Values(8), // decomposed Transpose + MHA + 5 Converts + 1 Transpose on output + ::testing::Values(6), // MHA + 5 Reorders on inputs and output ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), MHA::getTestCaseName); @@ -153,8 +153,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_Without_Multiply, ::testing::Values(ov::element::f16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -165,8 +165,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Static, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_FP16_4D_With_Multiply_Dynamic, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), - 
::testing::Values(3), + ::testing::Values(4), ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -191,8 +191,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_Without_Multiply, ::testing::Values(ov::element::f16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), MHA::getTestCaseName); @@ -203,8 +203,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Static, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), ::testing::Values(2), - ::testing::Values(1), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), MHA::getTestCaseName); @@ -215,7 +215,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceFP16_With_Multiply_Dynamic, ::testing::Values(ov::element::f16), ::testing::ValuesIn({true}), ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), + ::testing::Values(4), ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_f16_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp index 4bf35e2daa690d..f9bc640160a67c 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp @@ -21,8 +21,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::ValuesIn({false}), // Need to support True for graph builder in tests ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Subgraph with MHA + Subgraph with Transpose1 + ::testing::Values(2), // Subgraph with MHA + Subgraph with Transpose1 ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp index 0c731b74565863..38806dff765833 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp @@ -48,7 +48,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(7), // FQx3, Transpose1 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -63,7 +63,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // FQx2 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(6), // FQx2, 
Transpose1 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), @@ -77,8 +77,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // MHA + Transpose on output + Deq Mul - ::testing::Values(2), // MHA + Deq Mul + ::testing::Values(4), // Transpose1 + MHA + Transpose on output + Deq Mul + ::testing::Values(3), // Transpose1 + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); @@ -91,7 +91,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // Transposex2 + Subgraphsx5 + ::testing::Values(8), // Transposex3 + Subgraphsx5 ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp index 3fc1417d20b102..cc438301101811 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp @@ -29,8 +29,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // Need to support True for graph builder in tests ::testing::Values(MHA::default_thread_count), - ::testing::Values(2), // Less + MHA - ::testing::Values(2), + ::testing::Values(3), // Transpose1 + Less + MHA + ::testing::Values(3), // Transpose1 + Less + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp index bb5f7fe2fa5b52..d3598ebba1ac1f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp @@ -24,8 +24,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(true), ::testing::Values(4), // 4 Threads - ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output - ::testing::Values(1), + ::testing::Values(7), // Subgraph + 4 Reshapes, Transpose1 on inputs and 1 Reshape on output + ::testing::Values(2), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(enable_callback())), MHA::getTestCaseName); @@ -80,8 +80,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), ::testing::Values(4), // 4 Threads - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Transpose1 + MHA + ::testing::Values(2), // Transpose1 + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); diff --git 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp index 7876d737af2281..9a9e56621b10a6 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -43,8 +43,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::f32), ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), + ::testing::Values(2), // Transpose1 + MHA + ::testing::Values(2), // Transpose1 + MHA ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), MHAWithDynamicMul::getTestCaseName); @@ -56,7 +56,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::bf16), ::testing::Values(MHA::default_thread_count), - ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around + ::testing::Values(9), // Transpose1 + MHA + 1 Transpose on output + 6 Converts around ::testing::Values(7), // MHA + 6 Converts around ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 9ace85b3038afa..7c425b0bca6781 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -149,7 +149,7 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::f32, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto relu0 = std::make_shared(store); const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, BRGEMM_TYPE::STAND_ALONE); @@ -199,7 +199,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto parameter2 = std::make_shared(ov::element::bf16, shapes[2]); const auto order = std::vector{0, 2, 3, 1}; - const auto load_reshape = std::make_shared(parameter1, 1, 0, order); + const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto convert0 = std::make_shared(store, ov::element::f32); const auto relu0 = std::make_shared(convert0); diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 5f854326a47217..eb0dfaa8710fa8 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -110,46 +110,43 @@ std::shared_ptr MHAFunction::initReference() const { auto data1 = std::make_shared(precisions[1], input_shapes[1]); auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; - NodeVector subgraph_inputs = {data0, data1, data2, data3}; - - auto 
transpose0Param = std::make_shared(precisions[0], input_shapes[0]); - auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); - auto addParam = std::make_shared(precisions[2], input_shapes[2]); - auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; const auto rank = input_shapes[0].size(); const auto fusion_order = get_fusion_order(rank); const auto decomposed_order = get_decomposed_order(rank); - const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); - const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1 = std::make_shared(data1, transpose1Const); - const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - std::shared_ptr matmul_parent1 = transpose1; + std::shared_ptr subgraph_parent1 = transpose1; if (with_mul) { ov::Shape shape(rank, 1); if (transpose1->get_output_partial_shape(0).is_static()) { shape[rank - 3] = transpose1->get_output_shape(0)[rank - 3]; } - const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - if (ov::shape_size(shape) > 1) { - const auto mulParam = std::make_shared(precisions[1], mulConst->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {data0, data1, mulConst, data2, data3}; - } else { - matmul_parent1 = std::make_shared(transpose1, mulConst); - } + const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); + subgraph_parent1 = std::make_shared(transpose1, mulConst); } - const auto matMul0 = std::make_shared(transpose0, matmul_parent1); + NodeVector subgraph_inputs = {data0, subgraph_parent1, data2, data3}; + + auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); + auto brgemm1Param = std::make_shared(subgraph_parent1->get_element_type(), subgraph_parent1->get_output_partial_shape(0)); + auto addParam = std::make_shared(precisions[2], input_shapes[2]); + auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); + + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; + + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -168,55 +165,45 @@ std::shared_ptr MHASplitMFunction::initReference() const { auto data3 = std::make_shared(precisions[3], input_shapes[3]); 
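A worked example of the two transpose-order helpers used by these reference builders. This is a hedged sketch: the names mirror get_fusion_order/get_decomposed_order from subgraph_mha.cpp, but the bodies below simply hard-code the rank-4 values that also appear literally later in this patch.

#include <cstdint>
#include <vector>

// get_fusion_order(4): order that stays fused into the Brgemm inputs/outputs,
// e.g. swapping axes 1 and 2 of a 4D tensor.
std::vector<int64_t> fusion_order_rank4() {
    return {0, 2, 1, 3};
}

// get_decomposed_order(4): moves the innermost dim of the B (K) input away from
// the last position; the tokenizer decomposes this Transpose into LoadReorder + Store.
std::vector<int64_t> decomposed_order_rank4() {
    return {0, 2, 3, 1};
}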
ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; + const auto rank_before = input_shapes[1].size(); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank_before}, get_decomposed_order(rank_before)); + const auto transpose1 = std::make_shared(data1, transpose1Const); + + std::shared_ptr subgraph_parent1 = transpose1; + if (with_mul) { + ov::Shape shape(rank_before, 1); + if (transpose1->get_output_partial_shape(0).is_static()) { + shape[rank_before - 3] = transpose1->get_output_shape(0)[rank_before - 3]; + } + const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); + subgraph_parent1 = std::make_shared(transpose1, mulConst); + } + auto make_reshape = [](const std::shared_ptr& node, const ov::Shape& new_shape) { auto shape_const = ov::op::v0::Constant::create(ov::element::i32, {new_shape.size()}, new_shape); return std::make_shared(node, shape_const, true); }; auto reshape0 = make_reshape(data0, reshapes[0]); - auto reshape1 = make_reshape(data1, reshapes[1]); + auto reshape1 = make_reshape(subgraph_parent1, reshapes[1]); auto reshape2 = make_reshape(data2, reshapes[2]); auto reshape3 = make_reshape(data3, reshapes[3]); NodeVector subgraph_inputs = {reshape0, reshape1, reshape2, reshape3}; auto transpose0Param = std::make_shared(precisions[0], reshape0->get_shape()); - auto transpose1Param = std::make_shared(precisions[1], reshape1->get_shape()); + auto brgemm1Param = std::make_shared(precisions[1], reshape1->get_shape()); auto addParam = std::make_shared(precisions[2], reshape2->get_shape()); auto transpose2Param = std::make_shared(precisions[3], reshape3->get_shape()); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; const auto rank = input_shapes[0].size() + 1; - const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true)); - const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_decomposed_order_after_split_m(rank)); const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, true)); const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, get_fusion_order_after_split_m(rank, false)); const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - - std::shared_ptr matmul_parent1 = transpose1; - if (with_mul) { - ov::Shape shape(rank - 1, 1); - if (transpose1->get_output_partial_shape(0).is_static()) { - shape[rank - 4] = transpose1->get_output_shape(0)[rank - 4]; - } - const auto mulConst = ov::test::utils::make_constant(precisions[1], shape); - - if (ov::shape_size(shape) > 1) { - ov::Shape reshape_shape = shape; - reshape_shape.insert(reshape_shape.cbegin() + (rank - 3), 1); - const auto mulReshape = make_reshape(mulConst, reshape_shape); - const auto mulParam = std::make_shared(precisions[1], mulReshape->get_shape()); - matmul_parent1 = std::make_shared(transpose1, mulParam); - subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; - subgraph_inputs = {reshape0, reshape1, mulReshape, reshape2, reshape3}; - } else { - matmul_parent1 = std::make_shared(transpose1, mulConst); - } - } - const auto matMul0 = 
std::make_shared(transpose0, matmul_parent1); + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -318,30 +305,36 @@ std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { auto data2 = std::make_shared(precisions[2], input_shapes[2]); auto data3 = std::make_shared(precisions[3], input_shapes[3]); ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; - NodeVector subgraph_inputs = {data0, data1, data2, data3}; + + const auto rank = input_shapes[0].size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); + std::vector transposed_b_order(rank); + std::iota(transposed_b_order.begin(), transposed_b_order.end(), 0); + std::swap(transposed_b_order[rank - 1], transposed_b_order[rank - 2]); + + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose1 = std::make_shared(data1, transpose1Const); + const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); + const auto mul = std::make_shared(transpose1, mulConst); + const auto transposeBConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{rank}, transposed_b_order); + const auto transposeB = std::make_shared(mul, transposeBConst); + + NodeVector subgraph_inputs = {data0, transposeB, data2, data3}; auto transpose0Param = std::make_shared(precisions[0], input_shapes[0]); - auto transpose1Param = std::make_shared(precisions[1], input_shapes[1]); + auto brgemm1Param = std::make_shared(transposeB->get_element_type(), transposeB->get_output_partial_shape(0)); auto addParam = std::make_shared(precisions[2], input_shapes[2]); auto transpose2Param = std::make_shared(precisions[3], input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; - - const auto rank = input_shapes[0].size(); - const auto fusion_order = get_fusion_order(rank); - const auto decomposed_order = get_decomposed_order(rank); + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); - const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); - - const auto mulConst = ov::test::utils::make_constant(precisions[1], ov::Shape{1}); - const auto mul = std::make_shared(transpose1, mulConst); - const auto matMul0 = std::make_shared(transpose0, mul); + const auto matMul0 = std::make_shared(transpose0, brgemm1Param); const auto add = std::make_shared(matMul0, addParam); const auto softMax = std::make_shared(add, rank - 1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -818,29 +811,33 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::f32, 
fq_signed_params); const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::f32, fq_signed_params); const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::f32, fq_signed_params); - NodeVector subgraph_inputs = {fq0, fq1, data2, fq2}; - auto transpose0Param = std::make_shared(precision, input_shapes[0]); - auto transpose1Param = std::make_shared(precision, input_shapes[1]); - auto addParam = std::make_shared(precision, input_shapes[2]); - auto transpose2Param = std::make_shared(precision, input_shapes[3]); - ov::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + const auto rank = input_shapes[0].get_shape().size(); + const auto fusion_order = get_fusion_order(rank); + const auto decomposed_order = get_decomposed_order(rank); + const auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, decomposed_order); + const auto transpose1 = std::make_shared(fq1, transpose1Const); - const auto shape_rank = input_shapes[0].get_shape().size(); - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + NodeVector subgraph_inputs = {fq0, transpose1, data2, fq2}; + + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto brgemm1Param = std::make_shared(transpose1->get_element_type(), transpose1->get_output_partial_shape(0)); + const auto addParam = std::make_shared(precision, input_shapes[2]); + const auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ov::ParameterVector subgraph_params = {transpose0Param, brgemm1Param, addParam, transpose2Param}; + + const auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); + const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{rank}, fusion_order); bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); - const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared>( std::vector{ element::f32, element::f32 }, std::vector{ element::f32 }, ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(), - ov::op::TemporaryReplaceOutputType(transpose1, element::f32).get(), transA, transB); + ov::op::TemporaryReplaceOutputType(brgemm1Param, element::f32).get(), transA, transB); auto decomposed_fq = [](const ov::Output& input, const ov::element::Type& out_precision, float il, float ih, float scale) { @@ -941,8 +938,8 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto data2 = std::make_shared(precision, input_shapes[2]); ov::ParameterVector ngraphParam = {data0, data1, data2}; - bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 1, 3}) || - (!m_transposed_b && m_order == std::vector{0, 2, 3, 1})); + bool is_supported = ((m_transposed_b && m_order == std::vector{0, 2, 3, 1}) || + (!m_transposed_b && m_order == std::vector{0, 2, 1, 3})); 
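The flipped is_supported condition above lines up with the BrgemmCopyB::is_transposed check added earlier in this patch. Below is a self-contained restatement of that layout check with both orders worked through; it is a sketch only, with OPENVINO_ASSERT/IMPLICATION replaced by a plain assert.

#include <cassert>
#include <cstddef>
#include <vector>

// A layout is "transposed" when its innermost dim is not last; additionally,
// N must sit last or second-to-last.
bool is_transposed_layout(const std::vector<size_t>& layout) {
    const bool transposed = !layout.empty() && layout.back() != layout.size() - 1;
    if (transposed)
        assert(layout[layout.size() - 2] == layout.size() - 1 &&
               "BrgemmCopyB supports only the N dim placed as the last or second-to-last dimension");
    return transposed;
}

int main() {
    assert(!is_transposed_layout({0, 2, 1, 3}));  // planar: innermost dim (3) is last
    assert(is_transposed_layout({0, 2, 3, 1}));   // transposed: N is second-to-last, K last
    return 0;
}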
std::shared_ptr in1 = data1; if (!m_order.empty() && !is_supported) { @@ -963,11 +960,16 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto param0 = std::make_shared(precision, data0->get_output_partial_shape(0)); const auto param1 = std::make_shared(precision, in1->get_output_partial_shape(0)); const auto param2 = std::make_shared(precision, data2->get_output_partial_shape(0)); + ov::ParameterVector subgraph_params = {param0, param1, param2}; + ov::OutputVector subgraphs_inputs = {data0, in1, data2}; std::shared_ptr matmul0_in1 = param1; if (!m_order.empty() && is_supported) { - const auto transposeConst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{m_order.size()}, m_order); + const auto transposeConst = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{m_order.size()}, m_order); matmul0_in1 = std::make_shared(param1, transposeConst); + + std::swap(subgraphs_inputs[0], subgraphs_inputs[1]); + std::swap(subgraph_params[0], subgraph_params[1]); } const bool mm0_transpose_b = m_transposed_b && m_transpose_b_native_support; @@ -975,8 +977,7 @@ std::shared_ptr MHATransposedInputFunction::initReference() const { const auto softmax = std::make_shared(matMul0, -1); const auto matMul1 = std::make_shared(softmax, param2); - auto subgraph = std::make_shared(ov::NodeVector{data0, in1, data2}, - std::make_shared(NodeVector{matMul1}, ov::ParameterVector{param0, param1, param2})); + auto subgraph = std::make_shared(subgraphs_inputs, std::make_shared(NodeVector{matMul1}, subgraph_params)); ov::ResultVector results{std::make_shared(subgraph)}; return std::make_shared(results, ngraphParam, "mha");
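To recap the runtime logic this patch adds to BrgemmExternalRepackingAdjuster, here is a compact, self-contained sketch of the VNNI-blocked weight shape/order computation and the cache-fit heuristic that picks the repacking implementation. Names and the hard-coded VNNI/N-block values are illustrative assumptions; the real code queries brgemm_utils helpers and dnnl::utils::get_cache_size.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

using VectorDims = std::vector<size_t>;

// Planar [..., K, N] -> blocked [..., ceil(K / vnni), max(N, n_block), vnni],
// mirroring get_blk_shape above.
VectorDims blk_shape(const VectorDims& planar, size_t vnni, size_t n_block) {
    const size_t K = planar[planar.size() - 2];
    const size_t N = planar[planar.size() - 1];
    VectorDims blk(planar.begin(), planar.end() - 2);
    blk.insert(blk.end(), {(K + vnni - 1) / vnni, std::max(N, n_block), vnni});
    return blk;
}

// Matching order [0, ..., r-2, r-1, r-2]: the split K dim appears twice,
// mirroring get_blk_order above.
VectorDims blk_order(size_t rank) {
    VectorDims order(rank - 2);
    std::iota(order.begin(), order.end(), 0);
    order.insert(order.end(), {rank - 2, rank - 1, rank - 2});
    return order;
}

enum class RepackingImplType { IN_PARALLEL, SEPARATE };

// If the src + dst bytes of all repacked inputs fit into L1 + L2, repack inside
// the parallel kernel loop; otherwise repack in a separate parallel section.
RepackingImplType choose_impl(size_t repacking_bytes, size_t l1_plus_l2_bytes) {
    return repacking_bytes < l1_plus_l2_bytes ? RepackingImplType::IN_PARALLEL
                                              : RepackingImplType::SEPARATE;
}

// E.g. bf16 weights {1, 12, 128, 64} with vnni = 2, n_block = 32:
// blk_shape -> {1, 12, 64, 64, 2}, blk_order -> {0, 1, 2, 3, 2}.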