From fea453e7c46fd0740af80279c2a9b832526f491b Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Sat, 28 Dec 2024 09:34:07 +0100 Subject: [PATCH] [Snippets][CPU] Small optimizations --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 99 +++++++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 3 +- .../lowered/external_repacking_adjuster.cpp | 132 ++++++++++-------- .../lowered/external_repacking_adjuster.hpp | 14 +- 4 files changed, 167 insertions(+), 81 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index fb657263fc3161..11adfab7d5a9c8 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -95,13 +95,19 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + using call_functor = std::function&, size_t)>; + call_functor caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + callable(&call_args, indexes.data()); + }; + #ifdef OPENVINO_ARCH_X86_64 - if (should_repacking_be_in_parallel()) + if (should_repacking_be_in_parallel()) { + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args, indexes.data()); + }; + } #endif // OPENVINO_ARCH_X86_64 - callable(&call_args, indexes.data()); - }; if (m_parallel_exec_domain.size() == rank6D) { parallel_for6d(initializer, caller); @@ -169,14 +175,21 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { init_call_args(call_args, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + using call_functor = std::function&, size_t)>; + call_functor caller = [&](jit_snippets_call_args& call_args, 
const std::vector& indexes, size_t ithr) { update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); + callable(&call_args); + }; + #ifdef OPENVINO_ARCH_X86_64 - if (should_repacking_be_in_parallel()) + if (should_repacking_be_in_parallel()) { + caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); + callable(&call_args); + }; + } #endif // OPENVINO_ARCH_X86_64 - callable(&call_args); - }; if (m_parallel_exec_domain.size() == rank6D) { parallel_for6d(initializer, caller); @@ -973,6 +986,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptrtensor_rank; m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(), m_parallel_exec_domain.cend(), size_t(1), @@ -1021,6 +1035,49 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr()); + parallel_nt_static(0, [&](const int ithr, const int nthr) { + BrgemmCopyBKernel::call_args args; + size_t start = 0, end = 0; + splitter(batch, nthr, ithr, start, end); + for (size_t iwork = start; iwork < end; ++iwork) { + const uint8_t* src_u8 = src; + uint8_t* dst_u8 = dst; + size_t tmp = iwork; + for (ptrdiff_t j = static_cast(dom.size()) - 3; j >= 0; j--) { + auto idx = tmp % dom[j]; + tmp /= dom[j]; + + src_u8 += idx * in_str[j]; + dst_u8 += idx * out_str[j]; + } + args.src = src_u8; + args.tr_src = dst_u8; + (*ker)(&args); + } + }); +}; +} // namespace std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs) { auto reordered_in_ptrs = srcMemPtrs; @@ -1040,21 +1097,19 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons VectorDims dom; const auto& shape = dst_mem->getShape().getDims(); - OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data"); - init_parallel_domain(shape, rank6D, 2lu, dom); + OPENVINO_ASSERT(shape.size() <= 
m_tensor_rank, "Unsupported shape rank of repacking data"); + init_parallel_domain(shape, m_tensor_rank, 2lu, dom); const auto& in_strides = repacked_input.in_offsets(); const auto& out_strides = repacked_input.out_offsets(); - OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D, + OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()), "Unsupported shape rank of repacking data"); const auto& kernel = repacked_input.kernel(); - parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { - BrgemmCopyBKernel::call_args args; - args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3]; - args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; - (*kernel)(&args); - }); + if (m_tensor_rank == rank6D) + parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); + else + parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst); reordered_in_ptrs[in_idx] = dst_mem; offset += desc->getCurrentMemSize(); @@ -1072,15 +1127,11 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vectorget_config().get_clone_ptr(); + auto config = static_cast(generic_config.get()); + config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, prc)); + executor->update_by_config(*config); +} + bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); @@ -49,76 +81,66 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin for (const auto& p : m_executors) { const auto& i = p.first; const auto& shape = cpu_config->io_shapes[i]; - if (shape == cpu_config->latest_shapes[i]) - 
continue; const auto& layout = cpu_config->io_layouts[i]; const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout); const auto& K = *++planar_shape.rbegin(); const auto& N = *planar_shape.rbegin(); - // Create CPU Memory descriptor - const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); - const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); - const size_t brgemm_kernel_rank = 2; - // Firstly, batch dims are set - VectorDims requested_blocked_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank); - // Then, the blocked dims are formed - const auto new_K = snippets::utils::div_up(K, vnni_factor); - const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)); - requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, new_N, vnni_factor}); - - VectorDims requested_order(planar_shape.size() - brgemm_kernel_rank); - std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = planar_shape.size() - 1; - requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - - const auto desc = std::make_shared(precision, - Shape(planar_shape), - requested_blocked_shape, - requested_order); - - // Create Kernel using BrgemmCopyBExecutor - const auto& executor = p.second; - const auto copy_wei_stride = - ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); - const auto generic_config = executor->get_config().get_clone_ptr(); - auto config = static_cast(generic_config.get()); - config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); - executor->update_by_config(*config); + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + const auto blk_shape = get_blk_shape(planar_shape, prc); - // Save original input offsets for input before repacking. 
- const auto in_offsets = cpu_config->io_data_offsets[i]; + // src data + dst data per kernel call + const auto src_data = N * K * prc.size(); + const auto dst_data = + std::accumulate(blk_shape.rbegin(), blk_shape.rbegin() + 3, prc.size(), std::multiplies()); + data_size += src_data + dst_data; - ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); - m_configurator->compute_offsets(shape_for_offset, i, 0); - // Save new input offsets for input after repacking. - const auto out_offsets = cpu_config->io_data_offsets[i]; + update_kernel(p.second, shape, layout, N, K, prc); + } - cpu_config->repacked_inputs[i] = - CPURuntimeConfig::RepackedInput(executor->get_kernel(), desc, in_offsets, out_offsets); + const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); + const auto fit_into_L2 = data_size < L2_cache_size; + // Heuristic: If external repacking data doesn't fit in the cache L2, + // external repacking should be executed in separate parallel section before kernel execution. + cpu_config->repacking_impl_type = + fit_into_L2 ? 
CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE; - // src data + dst data per kernel call - data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size(); - } + const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL; + + for (const auto& p : m_executors) { + const auto& i = p.first; + const auto& shape = cpu_config->io_shapes[i]; + auto& repacked_in = cpu_config->repacked_inputs[i]; + + const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, cpu_config->io_layouts[i]); + auto blk_shape = get_blk_shape(planar_shape, prc); + // In parallel impl, each thread needs buffer with only shape [K_blk, N_blk, VNNI] to store repacking data + if (is_impl_parallel) { + std::fill(planar_shape.rbegin() + brgemm_kernel_rank, planar_shape.rend(), 1); + std::fill(blk_shape.rbegin() + brgemm_kernel_rank + 1, blk_shape.rend(), 1); + } + const auto order = get_blk_order(planar_shape.size()); + const auto desc = std::make_shared(prc, Shape(planar_shape), blk_shape, order); - if (!cpu_config->repacked_inputs.empty()) { - const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); - const auto fit_into_L2 = data_size < L2_cache_size; - // Heuristic: If external repacking data doesn't fit in the cache L2, - // external repacking should be executed in seperate parallel section before kernel execution. - cpu_config->repacking_impl_type = fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL - : CPURuntimeConfig::RepackingImplType::SEPARATE; + // Save original input offsets for input before repacking. + const auto in_offsets = + shape == cpu_config->latest_shapes[i] ? 
repacked_in.in_offsets() : cpu_config->io_data_offsets[i]; // In parallel case Kernel should not add offsets to repacked inputs because // they will be applied during repacking in execution stage - if (cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL) { - for (const auto& in : cpu_config->repacked_inputs) { - auto& offsets = cpu_config->io_data_offsets[in.first]; - std::fill(offsets.begin(), offsets.end(), 0); - } + if (is_impl_parallel) { + auto& offsets = cpu_config->io_data_offsets[i]; + std::fill(offsets.begin(), offsets.end(), 0); + } else { + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), blk_shape.begin(), blk_shape.end()); + m_configurator->compute_offsets(shape_for_offset, i, 0); } + const auto out_offsets = cpu_config->io_data_offsets[i]; + + repacked_in = CPURuntimeConfig::RepackedInput(p.second->get_kernel(), desc, in_offsets, out_offsets); } return true; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 6f4e3942b1f581..29bd60948ebadc 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -28,7 +28,19 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt } private: - std::unordered_map> m_executors; + using RepackExecutorPtr = std::shared_ptr; + static VectorDims get_blk_order(size_t shape_rank); + static VectorDims get_blk_shape(const VectorDims& shape, ov::element::Type prc); + + void update_kernel(const RepackExecutorPtr& executor, + const VectorDims& shape, + const VectorDims& layout, + size_t N, + size_t K, + ov::element::Type prc); + + const static size_t 
brgemm_kernel_rank = 2; + std::unordered_map m_executors; }; } // namespace intel_cpu