From 9bb96464c99ee950b3277fded23beaa55d07118e Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Mon, 30 Dec 2024 08:42:27 +0100
Subject: [PATCH] [Snippets][CPU] Applied Vladislav comments

---
 .../x64/jit_brgemm_copy_b_emitter.cpp          | 11 +---------
 src/plugins/intel_cpu/src/nodes/subgraph.cpp   |  2 +-
 .../snippets/x64/op/brgemm_copy_b.cpp          |  7 +++++++
 .../snippets/x64/op/brgemm_copy_b.hpp          |  2 ++
 .../x64/pass/eliminate_brgemm_copy_b.cpp       |  4 ++--
 .../lowered/external_repacking_adjuster.cpp    | 21 +++++++++++++------
 .../lowered/external_repacking_adjuster.hpp    |  4 ++--
 7 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp
index 6df658d8d72d0c..861b9779c25533 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp
@@ -21,15 +21,6 @@ using namespace ov::snippets::utils;
 
 namespace ov {
 namespace intel_cpu {
-namespace {
-bool get_is_transposed(const ov::snippets::lowered::ExpressionPtr& expr) {
-    const auto& layout = expr->get_input_port_descriptor(0)->get_layout();
-    const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1;
-    OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)),
-                              "supports only N dim placed as last or pre last dimension");
-    return is_transposed;
-}
-}  // namespace
 
 jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h,
                                                      cpu_isa_t isa,
@@ -50,7 +41,7 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h,
     const auto& src_prc = brgemm_repack->get_src_element_type();
     const auto& wei_prc = brgemm_repack->get_input_element_type(0);
     const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc);
-    const auto is_transposed = get_is_transposed(expr);
+    const auto is_transposed = BrgemmCopyB::is_transposed(expr->get_input_port_descriptor(0)->get_layout());
     const auto brgemm_type = get_brgemm_type(src_prc, is_transposed);
     const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type));
     m_with_comp = with_compensations(brgemm_type);
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
index f1dd7be98aa10e..a0a5537eaf3b1a 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -1015,7 +1015,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp
+bool BrgemmCopyB::is_transposed(const std::vector<size_t>& layout) {
+    const auto is_transposed = !layout.empty() && layout.back() != layout.size() - 1;
+    OPENVINO_ASSERT(IMPLICATION(is_transposed, (layout[layout.size() - 2] == layout.size() - 1)),
+                    "supports only N dim placed as last or pre last dimension");
+    return is_transposed;
+}
+
 BrgemmCopyB::ShapeInfer::ShapeInfer(const std::shared_ptr<ov::Node>& n) {
     const auto& brg_copyb = ov::as_type_ptr<BrgemmCopyB>(n);
     OPENVINO_ASSERT(brg_copyb, "Got invalid node in BrgemmCopyB::ShapeInfer");
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp
index 54e2c39fcf1c06..b4e7b030fc605b 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp
@@ -72,6 +72,8 @@ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op {
         Result infer(const std::vector<snippets::VectorDimsRef>& input_shapes) override;
     };
 
+    static bool is_transposed(const std::vector<size_t>& layout);
+
 private:
     void custom_constructor_validate_and_infer_types(std::vector<size_t> layout_input = {});
     void validate_element_type(const ov::element::Type& element_type);
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp
index 6176e99ebc3a9a..f17d052e7ffe43 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp
@@ -42,13 +42,13 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() {
             return false;
 
         // If there is non-empty and non-planar layout, we should insert reshape to support shape inference
-        if (!layout.empty() && !ov::snippets::utils::is_planar_layout(layout)) {
+        if (!ov::snippets::utils::is_planar_layout(layout)) {
             const auto& subtensor = in_desc->get_subtensor();
             const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout);
             ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout);
             ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor);
-            ov::replace_node(copy_b_node, reshape);
+            ov::replace_node_update_name(copy_b_node, reshape);
             return true;
         }
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
index 61748054acbbac..6f9a652620df2d 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
@@ -14,6 +14,8 @@ namespace ov {
 
 namespace intel_cpu {
 
+const size_t BrgemmExternalRepackingAdjuster::brgemm_kernel_rank = 2;
+
 BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
                                                                  const CPURuntimeConfigurator* configurator)
     : snippets::lowered::pass::RuntimeOptimizer(configurator) {
@@ -32,7 +34,9 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp
         const auto wei_prc = brgemm->get_input_element_type(1);
         const auto isa = brgemm_utils::get_primitive_isa(src_prc, brgemm_utils::with_amx(brgemm->get_type()));
         const auto inner_n_block = brgemm_utils::repacking::compute_inner_n_block(wei_prc);
-        auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, false, inner_n_block);
+        const auto is_transposed_b =
+            BrgemmCopyB::is_transposed(m_configurator->get_io_descs()[i]->get_layout());
+        auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, is_transposed_b, inner_n_block);
         m_executors[i] = std::make_shared<BrgemmCopyBKernelExecutor>(
             static_cast<const CPURuntimeConfigurator*>(m_configurator)->get_cache(), config);
@@ -49,13 +53,13 @@ VectorDims BrgemmExternalRepackingAdjuster::get_blk_order(size_t shape_rank) {
     return order;
 }
 
-VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& shape, ov::element::Type prc) {
+VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc) {
     const auto vnni_factor = brgemm_utils::compute_vnni_factor(prc);
-    const auto K = *++shape.rbegin();
-    const auto N = *shape.rbegin();
+    const auto K = *++planar_shape.rbegin();
+    const auto N = *planar_shape.rbegin();
     const auto new_K = snippets::utils::div_up(K, vnni_factor);
     const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(prc));
-    VectorDims blk_shape(shape.begin(), shape.end() - brgemm_kernel_rank);
+    VectorDims blk_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank);
     blk_shape.insert(blk_shape.end(), {new_K, new_N, vnni_factor});
     return blk_shape;
 }
@@ -66,9 +70,10 @@ void BrgemmExternalRepackingAdjuster::update_kernel(const RepackExecutorPtr& exe
                                                     size_t N,
                                                     size_t K,
                                                     ov::element::Type prc) {
-    const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, 1) * prc.size();
     const auto generic_config = executor->get_config().get_clone_ptr();
     auto config = static_cast<BrgemmCopyBKernelConfig*>(generic_config.get());
+    const auto idx = config->is_transposed_B() ? 0 : 1;
+    const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, idx) * prc.size();
     config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, prc));
     executor->update_by_config(*config);
 }
@@ -125,6 +130,10 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
         const auto desc = std::make_shared<CpuBlockedMemoryDesc>(prc, Shape(planar_shape), blk_shape, order);
 
         // Save original input offsets for input before repacking.
+        // If the shape has not been changed, it means that we have already created `RepackedInput` for this input
+        // on a previous pass call, and now `cpu_config->io_data_offsets[i]` contains offsets not for the original
+        // input: they were updated for blocked shapes/zeroed on the previous initialization, so we cannot use them
+        // as original offsets.
         const auto in_offsets =
             shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i];
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp
index 29bd60948ebadc..5efc5a738c5d76 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp
@@ -30,7 +30,7 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt
 private:
     using RepackExecutorPtr = std::shared_ptr<BrgemmCopyBKernelExecutor>;
     static VectorDims get_blk_order(size_t shape_rank);
-    static VectorDims get_blk_shape(const VectorDims& shape, ov::element::Type prc);
+    static VectorDims get_blk_shape(const VectorDims& planar_shape, ov::element::Type prc);
 
     void update_kernel(const RepackExecutorPtr& executor,
                        const VectorDims& shape,
@@ -39,7 +39,7 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt
                        size_t K,
                        ov::element::Type prc);
 
-    const static size_t brgemm_kernel_rank = 2;
+    static const size_t brgemm_kernel_rank;
 
     std::unordered_map<size_t, RepackExecutorPtr> m_executors;
 };
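
For reference, a minimal standalone C++ sketch (outside the OpenVINO codebase; the free-function name, example layouts, and the final index print are illustrative assumptions, not OpenVINO API) of the layout check this patch centralizes in `BrgemmCopyB::is_transposed` and of the stride-index selection added to `update_kernel`:

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Sketch of the check centralized in BrgemmCopyB::is_transposed():
// B is treated as transposed when the innermost dimension is not the
// last element of the layout order.
static bool is_transposed(const std::vector<size_t>& layout) {
    const bool transposed = !layout.empty() && layout.back() != layout.size() - 1;
    // Only "last" or "pre-last" placement of the innermost dimension is supported.
    assert(!transposed || layout[layout.size() - 2] == layout.size() - 1);
    return transposed;
}

int main() {
    const std::vector<size_t> planar{0, 1, 2, 3};        // innermost dim last
    const std::vector<size_t> transposed_b{0, 1, 3, 2};  // innermost dim pre-last

    std::cout << std::boolalpha << is_transposed(planar) << '\n';       // false
    std::cout << std::boolalpha << is_transposed(transposed_b) << '\n'; // true

    // Mirrors the index selection added in update_kernel(): dimension index 0
    // is used when B is transposed, 1 otherwise, when querying the copy stride.
    const size_t stride_dim_idx = is_transposed(transposed_b) ? 0 : 1;
    std::cout << stride_dim_idx << '\n';  // 0
    return 0;
}
```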