diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp
new file mode 100644
index 00000000000000..496ea1f2e9f5a7
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "nodes/executors/aarch64/subgraph.hpp"
+
+#include "snippets/op/subgraph.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                       const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& callable = m_schedule->get_callable<kernel>();
+
+    auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+        init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr);
+        update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+    };
+    auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+        callable(&call_args, indexes.data());
+    };
+
+    if (m_parallel_exec_domain.size() == rank6D) {
+        parallel_for6d(initializer, caller);
+    } else {
+        parallel_forNd(initializer, caller);
+    }
+}
+
+void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                                   const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& callable = m_schedule->get_callable<dynamic_kernel>();
+
+    OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
+    OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(),
+                    "Data offsets with invalid ranks detected");
+
+    // Note: we need to reset KernelExecutorTable to the state that was recorded in the
+    // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes
+    m_reset_exec_table_state();
+
+    std::vector<const uint8_t*> src_ptrs;
+    std::vector<uint8_t*> dst_ptrs;
+    init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out);
+
+    auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+        init_call_args(call_args, ithr);
+        update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+    };
+    auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+        update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
+        callable(&call_args);
+    };
+
+    if (m_parallel_exec_domain.size() == rank6D) {
+        parallel_for6d(initializer, caller);
+    } else {
+        parallel_forNd(initializer, caller);
+    }
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
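Both exec_impl variants above lean on SubgraphBaseExecutor::update_scratchpad_ptr() to hand every thread a private slice of one large scratchpad allocation. The following standalone sketch (illustrative sizes only, not code from the PR) shows the arithmetic behind that slicing:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t buffer_scratchpad_size = 1024;  // bytes one thread's kernel needs
    const int nthreads = 4;
    // mirrors m_internal_buffer_size = nthreads * buffer_scratchpad_size
    std::vector<uint8_t> scratchpad(nthreads * buffer_scratchpad_size);

    for (int ithr = 0; ithr < nthreads; ++ithr) {
        // each thread gets a disjoint slice: base + ithr * per-thread size
        uint8_t* slice = scratchpad.data() + ithr * buffer_scratchpad_size;
        std::printf("thread %d -> byte offset %zu\n", ithr, static_cast<size_t>(slice - scratchpad.data()));
    }
}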
diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp
new file mode 100644
index 00000000000000..d08e3b44d7641f
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "nodes/executors/subgraph.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class SubgraphExecutor : public SubgraphBaseExecutor {
+public:
+    template <typename... Args>
+    SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                     const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                     const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                     const std::vector<ptrdiff_t>& start_offset_in,
+                     const std::vector<ptrdiff_t>& start_offset_out,
+                     const BufferScratchpadAllocator& allocator,
+                     Args... rest)
+        : SubgraphBaseExecutor(snippet_config, snippet_attrs, snippet, start_offset_in, start_offset_out, allocator, rest...) {
+        m_buffer_scratchpad = allocator(m_internal_buffer_size);
+    }
+};
+
+class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor {
+public:
+    template <typename... Args>
+    SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args... args)
+        : SubgraphExecutor(snippet_config, args...),
+          SubgraphStaticBaseExecutor() {}
+
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor {
+public:
+    template <typename... Args>
+    SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args... args)
+        : SubgraphExecutor(snippet_config, args...),
+          SubgraphDynamicSpecializedBaseExecutor(snippet_config) {}
+
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+}  // namespace intel_cpu
+}  // namespace ov
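The header above keeps the platform executors thin through variadic constructor forwarding: the derived classes accept the runtime config plus an opaque parameter pack and push the pack through to their bases. A minimal self-contained illustration of the pattern (mock types, not the real classes):

#include <iostream>
#include <memory>

struct Config {};  // stands in for CPURuntimeConfig

struct Base {
    template <typename... Args>
    explicit Base(const std::shared_ptr<Config>& /*cfg*/, Args... /*rest*/) {
        std::cout << "Base consumes the config; the pack carries the remaining ctor args\n";
    }
};

struct Derived : Base {
    template <typename... Args>
    Derived(const std::shared_ptr<Config>& cfg, Args... args) : Base(cfg, args...) {}
};

int main() {
    auto cfg = std::make_shared<Config>();
    Derived d(cfg, 1, 2.0, "three");  // extra ctor args flow through unchanged
}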
diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
new file mode 100644
index 00000000000000..34ae1449b56567
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
@@ -0,0 +1,142 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "nodes/executors/subgraph.hpp"
+
+#if defined(OPENVINO_ARCH_ARM64)
+#    include "emitters/snippets/aarch64/cpu_generator.hpp"
+#else
+#    include "emitters/snippets/x64/cpu_generator.hpp"
+#endif
+
+#include "openvino/core/parallel.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                                             const std::shared_ptr<CPURuntimeConfig>& config) {
+    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
+    OPENVINO_ASSERT(config, "Runtime Config is empty!");
+
+    jit_snippets_compile_args jcp;
+    jcp.data_offsets = config->io_data_offsets;
+    SubgraphBaseExecutor::init_parallel_domain(config, jcp.exec_domain);
+    schedule =
+        std::make_shared<snippets::Schedule>(snippet_attrs->snippet->generate(reinterpret_cast<const void*>(&jcp)));
+}
+
+SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                           const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                                           const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                                           const std::vector<ptrdiff_t>& start_offset_in,
+                                           const std::vector<ptrdiff_t>& start_offset_out,
+                                           const BufferScratchpadAllocator& allocator,
+                                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
+    : m_schedule(snippet->get()),
+      m_start_offset_in(start_offset_in),
+      m_start_offset_out(start_offset_out) {
+    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
+    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
+    init_parallel_domain(snippet_config, m_parallel_exec_domain);
+
+    m_tensor_rank = snippet_config->tensor_rank;
+    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
+                                            m_parallel_exec_domain.cend(),
+                                            size_t(1),
+                                            std::multiplies<size_t>());
+    m_nthreads = std::min(parallel_get_max_threads(), static_cast<int>(m_harness_work_amount));
+
+    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
+    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
+                    "Undefined buffer scratchpad size!");
+    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
+}
+
+void SubgraphBaseExecutor::init_parallel_domain(const std::vector<size_t>& master_shape,
+                                                size_t tensor_rank,
+                                                size_t tile_rank,
+                                                std::vector<size_t>& domain) {
+    domain.resize(tensor_rank, 1);
+    std::fill(domain.begin(), domain.end(), 1);
+    std::copy(master_shape.cbegin(),
+              master_shape.cbegin() + (master_shape.size() - tile_rank),
+              domain.begin() + (tensor_rank - master_shape.size()));
+}
+
+void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                                std::vector<size_t>& domain) {
+    init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
+}
+
+void SubgraphBaseExecutor::parallel_for6d(
+    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
+    const auto& dom = m_parallel_exec_domain;
+
+    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
+        jit_snippets_call_args call_args;
+        initializer(call_args, ithr);
+
+        size_t start = 0, end = 0;
+        splitter(m_harness_work_amount, nthr, ithr, start, end);
+
+        std::vector<size_t> indexes{0, 0, 0, 0, 0};
+        parallel_it_init(start, indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2],
+                         indexes[3], dom[3], indexes[4], dom[4]);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            caller(call_args, indexes, ithr);
+            parallel_it_step(indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2],
+                             indexes[3], dom[3], indexes[4], dom[4]);
+        }
+    });
+}
+
+void SubgraphBaseExecutor::parallel_forNd(
+    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
+    const auto& dom = m_parallel_exec_domain;
+
+    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
+        jit_snippets_call_args call_args;
+        initializer(call_args, ithr);
+
+        size_t start = 0, end = 0;
+        splitter(m_harness_work_amount, nthr, ithr, start, end);
+
+        std::vector<size_t> indexes(dom.size() - 1, 0);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            size_t tmp = iwork;
+            for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 2; j >= 0; j--) {
+                indexes[j] = tmp % dom[j];
+                tmp /= dom[j];
+            }
+
+            caller(call_args, indexes, ithr);
+        }
+    });
+}
+
+void SubgraphBaseExecutor::execute(const dnnl::stream& strm,
+                                   const std::vector<MemoryPtr>& inMemPtrs,
+                                   const std::vector<MemoryPtr>& outMemPtrs) {
+    exec_impl(inMemPtrs, outMemPtrs);
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
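The index arithmetic inside parallel_forNd above can be exercised in isolation: a flat work-item id is peeled into one index per outer dimension, while the innermost dimension stays with the JIT kernel tile. A standalone sketch with made-up values (not code from the PR):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<size_t> dom = {2, 3, 4, 16};  // parallel domain; last dim is handled by the kernel
    std::vector<size_t> indexes(dom.size() - 1, 0);

    size_t iwork = 7;  // some work item in [0, 2*3*4)
    size_t tmp = iwork;
    for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 2; j >= 0; j--) {
        indexes[j] = tmp % dom[j];  // peel off the innermost remaining dimension
        tmp /= dom[j];
    }
    // iwork = 7 -> indexes = {0, 1, 3}, since ((0 * 3) + 1) * 4 + 3 == 7
    for (size_t j = 0; j < indexes.size(); ++j)
        std::printf("indexes[%zu] = %zu\n", j, indexes[j]);
}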
diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp
new file mode 100644
index 00000000000000..f2dd48ab6788a6
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/subgraph.hpp
@@ -0,0 +1,190 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "cpu_memory.h"
+#include "emitters/snippets/cpu_runtime_configurator.hpp"
+#include "emitters/snippets/jit_snippets_call_args.hpp"
+#include "snippets/generator.hpp"
+#include "snippets/op/subgraph.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+struct SubgraphAttrs {
+    // Local copy of subgraph node for canonization & code generation
+    std::shared_ptr<snippets::op::Subgraph> snippet;
+    uint64_t bodyHash;
+    std::vector<std::vector<size_t>> inMemOrders;
+    std::vector<std::vector<size_t>> outMemOrders;
+    std::vector<ov::element::Type> inMemPrecs;
+    std::vector<ov::element::Type> outMemPrecs;
+};
+
+class SubgraphCodeGenerator {
+public:
+    SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                          const std::shared_ptr<CPURuntimeConfig>& config);
+
+    const std::shared_ptr<snippets::Schedule>& get() const {
+        return schedule;
+    }
+
+private:
+    std::shared_ptr<snippets::Schedule> schedule;
+};
+
+class SubgraphBaseExecutor {
+public:
+    using BufferScratchpadAllocator = std::function<MemoryPtr(size_t)>;
+
+    SubgraphBaseExecutor() = default;
+    SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                         const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                         const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                         const std::vector<ptrdiff_t>& start_offset_in,
+                         const std::vector<ptrdiff_t>& start_offset_out,
+                         const BufferScratchpadAllocator& allocator,
+                         const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache);
+    virtual ~SubgraphBaseExecutor() = default;
+
+    virtual void execute(const dnnl::stream& strm,
+                         const std::vector<MemoryPtr>& inMemPtrs,
+                         const std::vector<MemoryPtr>& outMemPtrs);
+
+    static void init_parallel_domain(const std::vector<size_t>& master_shape,
+                                     size_t tensor_rank,
+                                     size_t tile_rank,
+                                     std::vector<size_t>& domain);
+    static void init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                     std::vector<size_t>& domain);
+
+protected:
+    virtual void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;
+
+    virtual void parallel_for6d(
+        const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+        const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
+    virtual void parallel_forNd(
+        const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+        const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
+
+    inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const {
+        if (m_buffer_scratchpad_size > 0)
+            scratchpad_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + ithr * m_buffer_scratchpad_size;
+    }
+
+    using initializer_functor = std::function<void(jit_snippets_call_args&, size_t)>;
+    using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;
+
+    std::shared_ptr<snippets::Schedule> m_schedule;
+    // Holds index of output used as in execution domain
+    // it should be compatible with a schedule's work size
+    std::vector<size_t> m_parallel_exec_domain = {};
+    size_t m_harness_work_amount = 0;
+
+    // Buffer scratchpad
+    MemoryPtr m_buffer_scratchpad = nullptr;
+    size_t m_buffer_scratchpad_size = 0;
+    size_t m_internal_buffer_size = 0;
+    size_t m_tensor_rank = 0;
+
+    const size_t rank6D = 6;
+
+    // Count of threads for parallel_nt
+    int m_nthreads = 0;
+
+    std::vector<ptrdiff_t> m_start_offset_in = {};
+    std::vector<ptrdiff_t> m_start_offset_out = {};
+};
+
+// Class for Subgraphs with static shapes
+class SubgraphStaticBaseExecutor {
+public:
+    SubgraphStaticBaseExecutor() = default;
+    virtual ~SubgraphStaticBaseExecutor() = default;
+
+protected:
+    typedef void (*kernel)(const void*, const void*);
+
+    inline void init_call_args(jit_snippets_call_args& call_args,
+                               const std::vector<MemoryPtr>& srcMemPtrs,
+                               const std::vector<MemoryPtr>& dstMemPtrs,
+                               const std::vector<ptrdiff_t>& start_offset_in,
+                               const std::vector<ptrdiff_t>& start_offset_out,
+                               size_t ithr) {
+        for (size_t i = 0; i < srcMemPtrs.size(); i++)
+            call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs<const uint8_t>() + start_offset_in[i];
+
+        for (size_t i = 0; i < dstMemPtrs.size(); i++)
+            call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs<uint8_t>() + start_offset_out[i];
+    }
+};
+
+// Specialized dynamic executor based on shape agnostic kernel for the specific input shapes
+class SubgraphDynamicSpecializedBaseExecutor {
+public:
+    SubgraphDynamicSpecializedBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config) {
+        m_buffer_offsets = snippet_config->buffer_cluster_offsets;
+        m_data_offsets = snippet_config->io_data_offsets;
+        m_loop_args = snippet_config->loop_args;
+        m_reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset();
+    }
+    virtual ~SubgraphDynamicSpecializedBaseExecutor() = default;
+
+protected:
+    typedef void (*dynamic_kernel)(const void*);
+
+    inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) {
+        call_args.register_loops(m_loop_args);
+        std::copy(m_buffer_offsets.cbegin(), m_buffer_offsets.cend(), call_args.buffer_offsets);
+    }
+
+    inline void init_original_ptrs(const std::vector<MemoryPtr>& srcMemPtrs,
+                                   const std::vector<MemoryPtr>& dstMemPtrs,
+                                   std::vector<const uint8_t*>& src_ptrs,
+                                   std::vector<uint8_t*>& dst_ptrs,
+                                   const std::vector<ptrdiff_t>& start_offset_in,
+                                   const std::vector<ptrdiff_t>& start_offset_out) {
+        const auto in_num = srcMemPtrs.size();
+        const auto out_num = dstMemPtrs.size();
+
+        src_ptrs.resize(in_num, nullptr);
+        dst_ptrs.resize(out_num, nullptr);
+
+        for (size_t i = 0; i < in_num; i++)
+            src_ptrs[i] = srcMemPtrs[i]->getDataAs<const uint8_t>() + start_offset_in[i];
+        for (size_t i = 0; i < out_num; i++)
+            dst_ptrs[i] = dstMemPtrs[i]->getDataAs<uint8_t>() + start_offset_out[i];
+    }
+
+    inline void update_ptrs(jit_snippets_call_args& call_args,
+                            const std::vector<const uint8_t*>& src_ptrs,
+                            const std::vector<uint8_t*>& dst_ptrs,
+                            const std::vector<size_t>& indexes) const {
+        for (size_t i = 0; i < src_ptrs.size(); i++) {
+            auto i_ptr = src_ptrs[i];
+            for (size_t j = 0; j < indexes.size(); j++) {
+                i_ptr += m_data_offsets[i][j] * indexes[j];
+            }
+            call_args.src_ptrs[i] = i_ptr;
+        }
+        for (size_t i = 0; i < dst_ptrs.size(); i++) {
+            auto i_ptr = dst_ptrs[i];
+            for (size_t j = 0; j < indexes.size(); j++) {
+                i_ptr += m_data_offsets[i + src_ptrs.size()][j] * indexes[j];
+            }
+            call_args.dst_ptrs[i] = i_ptr;
+        }
+    }
+
+    std::vector<size_t> m_buffer_offsets = {};
+    std::vector<std::vector<size_t>> m_data_offsets = {};
+    std::vector<jit_snippets_call_args::loop_args_t> m_loop_args = {};
+    std::function<void()> m_reset_exec_table_state;
+};
+
+}  // namespace intel_cpu
+}  // namespace ov
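update_ptrs() in the dynamic base executor computes every effective kernel pointer as base plus a dot product of per-dimension byte offsets with the current loop indexes. A tiny standalone illustration with made-up strides (not code from the PR):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<uint8_t> tensor(2 * 3 * 16);
    const uint8_t* base = tensor.data();

    const std::vector<size_t> offsets = {3 * 16, 16};  // byte strides of the two outer dims
    const std::vector<size_t> indexes = {1, 2};        // current parallel-loop position

    const uint8_t* i_ptr = base;
    for (size_t j = 0; j < indexes.size(); j++)
        i_ptr += offsets[j] * indexes[j];  // same accumulation as update_ptrs()

    std::printf("effective offset: %td bytes\n", i_ptr - base);  // 1*48 + 2*16 = 80
}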
diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp
new file mode 100644
index 00000000000000..22b18606ba8800
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp
@@ -0,0 +1,308 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "nodes/executors/x64/subgraph.hpp"
+
+#include "emitters/snippets/x64/cpu_generator.hpp"
+#include "snippets/op/subgraph.hpp"
+
+#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS)
+#    include <csignal>
+
+#    include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp"
+std::mutex err_print_lock;
+#endif
+
+namespace ov {
+namespace intel_cpu {
+
+namespace {
+inline void parallel4d_repacking(const BrgemmCopyBKernel* ker,
+                                 const VectorDims& dom,
+                                 const VectorDims& in_str,
+                                 const VectorDims& out_str,
+                                 const uint8_t* src,
+                                 uint8_t* dst) {
+    parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) {
+        BrgemmCopyBKernel::call_args args;
+        args.src = src + d0 * in_str[0] + d1 * in_str[1] + d2 * in_str[2] + d3 * in_str[3];
+        args.tr_src = dst + d0 * out_str[0] + d1 * out_str[1] + d2 * out_str[2] + d3 * out_str[3];
+        (*ker)(&args);
+    });
+};
+inline void parallelNd_repacking(const BrgemmCopyBKernel* ker,
+                                 const VectorDims& dom,
+                                 const VectorDims& in_str,
+                                 const VectorDims& out_str,
+                                 const uint8_t* src,
+                                 uint8_t* dst) {
+    const size_t batch = std::accumulate(dom.rbegin() + 2, dom.rend(), 1lu, std::multiplies<size_t>());
+    parallel_nt_static(0, [&](const int ithr, const int nthr) {
+        BrgemmCopyBKernel::call_args args;
+        size_t start = 0, end = 0;
+        splitter(batch, nthr, ithr, start, end);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            const uint8_t* src_u8 = src;
+            uint8_t* dst_u8 = dst;
+            size_t tmp = iwork;
+            for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 3; j >= 0; j--) {
+                auto idx = tmp % dom[j];
+                tmp /= dom[j];
+
+                src_u8 += idx * in_str[j];
+                dst_u8 += idx * out_str[j];
+            }
+            args.src = src_u8;
+            args.tr_src = dst_u8;
+            (*ker)(&args);
+        }
+    });
+};
+}  // namespace
+
+SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                   const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                                   const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                                   const std::vector<ptrdiff_t>& start_offset_in,
+                                   const std::vector<ptrdiff_t>& start_offset_out,
+                                   const BufferScratchpadAllocator& allocator,
+                                   const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
+    : SubgraphBaseExecutor(snippet_config,
+                           snippet_attrs,
+                           snippet,
+                           start_offset_in,
+                           start_offset_out,
+                           allocator,
+                           kernel_cache) {
+    m_repacking_impl_type = snippet_config->repacking_impl_type;
+    m_repacked_inputs = snippet_config->repacked_inputs;
+
+    auto external_buffer_size =
+        std::accumulate(m_repacked_inputs.begin(),
+                        m_repacked_inputs.end(),
+                        size_t(0),
+                        [](size_t sum, const std::pair<size_t, RepackedInput>& p) {
+                            return sum + p.second.desc()->getCurrentMemSize();
+                        });
+
+    if (should_repacking_be_in_parallel()) {
+        // When external repacking is applied in parallel section,
+        // each thread should have own buffer to store repacked data
+        external_buffer_size *= m_nthreads;
+
+        // To avoid extra overheads in runtime on vector creation,
+        // we initialize `repacked_offsets_by_threads` by default here
+        m_repacked_offsets_by_threads.resize(m_nthreads);
+        for (size_t i = 0; i < m_repacked_offsets_by_threads.size(); ++i)
+            clean_repacked_offsets(i);
+
+        if (m_tensor_rank == rank6D) {
+            init_offset = [](const std::vector<size_t>& offsets, const std::vector<size_t>& indexes, size_t& offset) {
+                offset += offsets[0] * indexes[0] + offsets[1] * indexes[1] + offsets[2] * indexes[2] +
+                          offsets[3] * indexes[3];
+            };
+        } else {
+            init_offset = [](const std::vector<size_t>& offsets, const std::vector<size_t>& indexes, size_t& offset) {
+                for (size_t j = 0; j < indexes.size(); j++)
+                    offset += offsets[j] * indexes[j];
+            };
+        }
+    }
+
+    m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size);
+
+#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS)
+    const auto target = std::dynamic_pointer_cast<const CPUTargetMachine>(
+        snippet_attrs->snippet->get_generator()->get_target_machine());
+    enabled_segfault_detector = target && target->debug_config.enable_segfault_detector;
+#endif
+}
+
+#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS)
+void SubgraphExecutor::segfault_detector() {
+    if (enabled_segfault_detector) {
+        __sighandler_t signal_handler = [](int signal) {
+            std::lock_guard<std::mutex> guard(err_print_lock);
+            if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local())
+                std::cout << segfault_detector_emitter->info() << std::endl;
+            auto tid = parallel_get_thread_num();
+            OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " +
+                           std::to_string(tid));
+        };
+        struct sigaction new_handler {};
+        new_handler.sa_handler = signal_handler;
+        sigaction(SIGSEGV, &new_handler, nullptr);
+    }
+}
+#endif
+
+std::vector<MemoryPtr> SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm,
+                                                                  const std::vector<MemoryPtr>& srcMemPtrs) {
+    auto reordered_in_ptrs = srcMemPtrs;
+    size_t offset = m_internal_buffer_size;
+    for (const auto& p : m_repacked_inputs) {
+        const auto in_idx = p.first;
+        const auto& repacked_input = p.second;
+        const auto& desc = repacked_input.desc();
+        const void* data_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + offset;
+
+        OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr");
+        const auto& src_mem = srcMemPtrs[in_idx];
+        const auto& dst_mem = std::make_shared<Memory>(strm.get_engine(), desc, data_ptr, false);
+
+        const auto* src = src_mem->getDataAs<const uint8_t>() + m_start_offset_in[in_idx];
+        auto* dst = dst_mem->getDataAs<uint8_t>();
+
+        VectorDims dom;
+        const auto& shape = dst_mem->getShape().getDims();
+        OPENVINO_ASSERT(shape.size() <= m_tensor_rank, "Unsupported shape rank of repacking data");
+        init_parallel_domain(shape, m_tensor_rank, 2lu, dom);
+
+        const auto& in_strides = repacked_input.in_offsets();
+        const auto& out_strides = repacked_input.out_offsets();
+        OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()),
+                        "Unsupported shape rank of repacking data");
+
+        const auto& kernel = repacked_input.kernel();
+        if (m_tensor_rank == rank6D)
+            parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);
+        else
+            parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);
+
+        reordered_in_ptrs[in_idx] = dst_mem;
+        offset += desc->getCurrentMemSize();
+    }
+    return reordered_in_ptrs;
+}
+
+void SubgraphExecutor::in_parallel_repack_inputs(const std::vector<MemoryPtr>& inMemPtrs,
+                                                 const std::vector<size_t>& indexes,
+                                                 int ithr,
+                                                 jit_snippets_call_args& call_args) {
+    size_t repacked_offset_idx = 0;
+    for (const auto& p : m_repacked_inputs) {
+        const auto& in_idx = p.first;
+        const auto& repacked_in = p.second;
+
+        size_t src_offset = m_start_offset_in[in_idx];
+        init_offset(repacked_in.in_offsets(), indexes, src_offset);
+
+        auto* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx);
+
+        auto& last_processed_src_offset = m_repacked_offsets_by_threads[ithr][repacked_offset_idx];
+        if (src_offset != last_processed_src_offset) {
+            BrgemmCopyBKernel::call_args args;
+            args.src = inMemPtrs[in_idx]->getDataAs<const uint8_t>() + src_offset;
+            args.tr_src = repacked_ptr;
+            (*repacked_in.kernel())(&args);
+
+            last_processed_src_offset = src_offset;
+        }
+
+        call_args.src_ptrs[in_idx] = repacked_ptr;
+        ++repacked_offset_idx;
+    }
+}
+
+void SubgraphExecutor::execute(const dnnl::stream& strm,
+                               const std::vector<MemoryPtr>& inMemPtrs,
+                               const std::vector<MemoryPtr>& outMemPtrs) {
+    if (should_repacking_be_separately()) {
+        exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs);
+        return;
+    }
+
+    exec_impl(inMemPtrs, outMemPtrs);
+}
+
+void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                       const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& callable = m_schedule->get_callable<kernel>();
+
+    initializer_functor initializer;
+    call_functor caller;
+    if (should_repacking_be_in_parallel()) {
+        initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+            init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr);
+            update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+            clean_repacked_offsets(ithr);
+        };
+        caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+            in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
+            callable(&call_args, indexes.data());
+        };
+    } else {
+        initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+            init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr);
+            update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+        };
+        caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+            callable(&call_args, indexes.data());
+        };
+    }
+
+#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS)
+    segfault_detector();
+#endif
+
+    if (m_parallel_exec_domain.size() == rank6D) {
+        parallel_for6d(initializer, caller);
+    } else {
+        parallel_forNd(initializer, caller);
+    }
+}
+
+void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                                   const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& callable = m_schedule->get_callable<dynamic_kernel>();
+
+    OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
+    OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(),
+                    "Data offsets with invalid ranks detected");
+
+    // Note: we need to reset KernelExecutorTable to the state that was recorded in the
+    // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes
+    m_reset_exec_table_state();
+
+    std::vector<const uint8_t*> src_ptrs;
+    std::vector<uint8_t*> dst_ptrs;
+    init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out);
+
+    initializer_functor initializer;
+    call_functor caller;
+    if (should_repacking_be_in_parallel()) {
+        initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+            init_call_args(call_args, ithr);
+            update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+            clean_repacked_offsets(ithr);
+        };
+        caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+            update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
+            in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
+            callable(&call_args);
+        };
+    } else {
+        initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
+            init_call_args(call_args, ithr);
+            update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
+        };
+        caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
+            update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
+            callable(&call_args);
+        };
+    }
+
+#if defined(__linux__) && defined(SNIPPETS_DEBUG_CAPS)
+    segfault_detector();
+#endif
+
+    if (m_parallel_exec_domain.size() == rank6D) {
+        parallel_for6d(initializer, caller);
+    } else {
+        parallel_forNd(initializer, caller);
+    }
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
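in_parallel_repack_inputs() above avoids redundant BrgemmCopyB calls by remembering, per thread and per repacked input, the last source offset it repacked; consecutive work items that reuse the same block skip the copy. A mocked sketch of that memoization (the kernel call is replaced by a printf; offsets are illustrative):

#include <cstdio>
#include <limits>
#include <vector>

int main() {
    // per-input slot, reset to max() at the start of each thread's chunk
    std::vector<size_t> last_offset(1, std::numeric_limits<size_t>::max());
    const std::vector<size_t> offsets_per_iteration = {0, 0, 128, 128, 256};

    for (size_t src_offset : offsets_per_iteration) {
        if (src_offset != last_offset[0]) {
            std::printf("repack block at offset %zu\n", src_offset);  // (*kernel)(&args) in the real code
            last_offset[0] = src_offset;
        }
    }
    // Only three repacks happen for five iterations.
}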
diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp
new file mode 100644
index 00000000000000..fa0eb5f1583d2d
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.hpp
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "nodes/executors/subgraph.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+class SubgraphExecutor : public SubgraphBaseExecutor {
+public:
+    SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                     const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                     const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                     const std::vector<ptrdiff_t>& start_offset_in,
+                     const std::vector<ptrdiff_t>& start_offset_out,
+                     const BufferScratchpadAllocator& allocator,
+                     const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache);
+
+    void execute(const dnnl::stream& strm,
+                 const std::vector<MemoryPtr>& inMemPtrs,
+                 const std::vector<MemoryPtr>& outMemPtrs) override;
+
+protected:
+    std::vector<MemoryPtr> separately_repack_inputs(const dnnl::stream& strm, const std::vector<MemoryPtr>& srcMemPtrs);
+    void in_parallel_repack_inputs(const std::vector<MemoryPtr>& inMemPtrs,
+                                   const std::vector<size_t>& indexes,
+                                   int ithr,
+                                   jit_snippets_call_args& call_args);
+
+    inline void* get_external_scratchpad_ptr(size_t ithr, size_t idx) const {
+        if (m_repacked_inputs.empty())
+            return nullptr;
+
+        uint8_t* data_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + m_internal_buffer_size;
+        for (const auto& p : m_repacked_inputs) {
+            const auto& desc = p.second.desc();
+            const auto size = desc->getCurrentMemSize();
+            if (p.first == idx) {
+                return data_ptr + ithr * size;
+            }
+            data_ptr += m_nthreads * size;
+        }
+        OPENVINO_THROW("External buffer pointer has not been found");
+    }
+
+    // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ]
+    std::vector<std::vector<size_t>> m_repacked_offsets_by_threads = {};
+    std::unordered_map<size_t, RepackedInput> m_repacked_inputs = {};
+
+    std::function<void(const std::vector<size_t>&, const std::vector<size_t>&, size_t&)> init_offset = {};
+
+    inline bool should_repacking_be_separately() const {
+        return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::SEPARATE;
+    }
+    inline bool should_repacking_be_in_parallel() const {
+        return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL;
+    }
+    inline void clean_repacked_offsets(size_t ithr) {
+        m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits<size_t>::max());
+    }
+
+#ifdef SNIPPETS_DEBUG_CAPS
+    bool enabled_segfault_detector = false;
+    inline void segfault_detector();
+#endif
+
+private:
+    CPURuntimeConfig::RepackingImplType m_repacking_impl_type = CPURuntimeConfig::RepackingImplType::NONE;
+};
+
+class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor {
+public:
+    template <typename... Args>
+    SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args... args)
+        : SubgraphExecutor(snippet_config, args...),
+          SubgraphStaticBaseExecutor() {}
+
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor {
+public:
+    template <typename... Args>
+    SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args... args)
+        : SubgraphExecutor(snippet_config, args...),
+          SubgraphDynamicSpecializedBaseExecutor(snippet_config) {}
+
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+}  // namespace intel_cpu
+}  // namespace ov
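get_external_scratchpad_ptr() above walks a layout in which, past the internal per-thread region, each repacked input owns a contiguous run of nthreads * size bytes, and thread ithr takes the ithr-th slice of its input's run. A standalone sketch of that address computation (sizes are assumptions; std::map is used only to make the walk deterministic in this demo):

#include <cstdio>
#include <map>

int main() {
    const size_t internal_buffer_size = 4096;
    const int nthreads = 4;
    const std::map<size_t, size_t> repacked_sizes = {{0, 256}, {2, 512}};  // input idx -> per-thread bytes

    const size_t idx = 2, ithr = 3;  // the slice we want
    size_t offset = internal_buffer_size;
    for (const auto& p : repacked_sizes) {
        if (p.first == idx) {
            offset += ithr * p.second;  // step into this input's run
            break;
        }
        offset += nthreads * p.second;  // skip the whole run of an earlier input
    }
    std::printf("input %zu, thread %zu -> byte offset %zu\n", idx, ithr, offset);  // 4096 + 4*256 + 3*512
}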
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
index a0a5537eaf3b1a..962c05e76bf29a 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -6,8 +6,6 @@
 #include "common/primitive_hashing_utils.hpp"
 #include "dnnl_extension_utils.h"
 #include "onednn/dnnl.h"
-#include "openvino/core/parallel.hpp"
-#include "openvino/core/rt_info.hpp"
 #include "shape_inference/custom/subgraph.hpp"
 #include "snippets/lowered/pass/init_loops.hpp"
 #include "snippets/lowered/pass/insert_buffers.hpp"
@@ -27,9 +25,11 @@
 #if defined(OPENVINO_ARCH_ARM64)
 #    include "emitters/snippets/aarch64/cpu_generator.hpp"
+#    include "executors/aarch64/subgraph.hpp"
 #    include "transformations/snippets/aarch64/shape_inference.hpp"
 #else
 #    include "emitters/snippets/x64/cpu_generator.hpp"
+#    include "executors/x64/subgraph.hpp"
 #    include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp"
 #    include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp"
 #    include "transformations/snippets/x64/pass/enforce_precision.hpp"
@@ -48,13 +48,6 @@
 #include "utils/cpu_utils.hpp"
 #include "utils/ngraph_utils.hpp"
 
-#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
-#    include <csignal>
-
-#    include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp"
-std::mutex err_print_lock;
-#endif
-
 #ifdef SNIPPETS_LIBXSMM_TPP
 #    include "snippets/lowered/pass/optimize_domain.hpp"
 #    include "transformations/tpp/x64/pass/brgemm_to_brgemm_tpp.hpp"
@@ -70,189 +63,9 @@ namespace intel_cpu {
 namespace node {
 
 namespace {
-// Class for Subgraphs with static shapes
-class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor {
-public:
-    SubgraphStaticExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                           const std::shared_ptr<Subgraph::SubgraphCodeGenerator>& snippet,
-                           const std::vector<ptrdiff_t>& start_offset_in,
-                           const std::vector<ptrdiff_t>& start_offset_out,
-                           const std::shared_ptr<CPURuntimeConfig>& snippet_config,
-                           const BufferScratchpadAllocator& allocator,
-                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
-        : SubgraphExecutor(snippet_attrs,
-                           snippet,
-                           start_offset_in,
-                           start_offset_out,
-                           snippet_config,
-                           allocator,
-                           kernel_cache) {}
-
-    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
-        const auto& callable = m_schedule->get_callable<kernel>();
-
-        auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
-            init_call_args(call_args, inMemPtrs, outMemPtrs, ithr);
-        };
-
-        using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;
-        call_functor caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
-            callable(&call_args, indexes.data());
-        };
-
-#ifdef OPENVINO_ARCH_X86_64
-        if (should_repacking_be_in_parallel()) {
-            caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
-                in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
-                callable(&call_args, indexes.data());
-            };
-        }
-#endif  // OPENVINO_ARCH_X86_64
-
-        if (m_parallel_exec_domain.size() == rank6D) {
-            parallel_for6d(initializer, caller);
-        } else {
-            parallel_forNd(initializer, caller);
-        }
-    }
-
-protected:
-    typedef void (*kernel)(const void*, const void*);
-
-    inline void init_call_args(jit_snippets_call_args& call_args,
-                               const std::vector<MemoryPtr>& srcMemPtrs,
-                               const std::vector<MemoryPtr>& dstMemPtrs,
-                               size_t ithr) {
-        for (size_t i = 0; i < srcMemPtrs.size(); i++)
-            call_args.src_ptrs[i] = srcMemPtrs[i]->getDataAs<const uint8_t>() + m_start_offset_in[i];
-
-        for (size_t i = 0; i < dstMemPtrs.size(); i++)
-            call_args.dst_ptrs[i] = dstMemPtrs[i]->getDataAs<uint8_t>() + m_start_offset_out[i];
-
-        update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
-    }
-};
-
-// Specialized dynamic executor based on shape agnostic kernel for the specific input shapes
-class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor {
-public:
-    SubgraphDynamicSpecializedExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                                       const std::shared_ptr<Subgraph::SubgraphCodeGenerator>& snippet,
-                                       const std::vector<ptrdiff_t>& start_offset_in,
-                                       const std::vector<ptrdiff_t>& start_offset_out,
-                                       const std::shared_ptr<CPURuntimeConfig>& snippet_config,
-                                       const BufferScratchpadAllocator& allocator,
-                                       const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
-        : SubgraphExecutor(snippet_attrs,
-                           snippet,
-                           start_offset_in,
-                           start_offset_out,
-                           snippet_config,
-                           allocator,
-                           kernel_cache) {
-        buffer_offsets = snippet_config->buffer_cluster_offsets;
-        data_offsets = snippet_config->io_data_offsets;
-        loop_args = snippet_config->loop_args;
-        reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset();
-    }
-
-    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
-        const auto& callable = m_schedule->get_callable<dynamic_kernel>();
-
-        OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
-        OPENVINO_ASSERT(data_offsets.front().size() == m_parallel_exec_domain.size(),
-                        "Data offsets with invalid ranks detected");
-
-        // Note: we need to reset KernelExecutorTable to the state that was recorded in the
-        // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes
-        reset_exec_table_state();
-
-        std::vector<const uint8_t*> src_ptrs;
-        std::vector<uint8_t*> dst_ptrs;
-        init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs);
-
-        auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
-            init_call_args(call_args, ithr);
-        };
-
-        using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;
-        call_functor caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
-            update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
-            callable(&call_args);
-        };
-
-#ifdef OPENVINO_ARCH_X86_64
-        if (should_repacking_be_in_parallel()) {
-            caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
-                update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
-                in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
-                callable(&call_args);
-            };
-        }
-#endif  // OPENVINO_ARCH_X86_64
-
-        if (m_parallel_exec_domain.size() == rank6D) {
-            parallel_for6d(initializer, caller);
-        } else {
-            parallel_forNd(initializer, caller);
-        }
-    }
-
-protected:
-    typedef void (*dynamic_kernel)(const void*);
-
-    inline void init_call_args(jit_snippets_call_args& call_args, size_t ithr) {
-        call_args.register_loops(loop_args);
-        std::copy(buffer_offsets.cbegin(), buffer_offsets.cend(), call_args.buffer_offsets);
-
-        update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
-    }
-
-    inline void init_original_ptrs(const std::vector<MemoryPtr>& srcMemPtrs,
-                                   const std::vector<MemoryPtr>& dstMemPtrs,
-                                   std::vector<const uint8_t*>& src_ptrs,
-                                   std::vector<uint8_t*>& dst_ptrs) {
-        const auto in_num = srcMemPtrs.size();
-        const auto out_num = dstMemPtrs.size();
-
-        src_ptrs.resize(in_num, nullptr);
-        dst_ptrs.resize(out_num, nullptr);
-
-        for (size_t i = 0; i < in_num; i++)
-            src_ptrs[i] = srcMemPtrs[i]->getDataAs<const uint8_t>() + m_start_offset_in[i];
-        for (size_t i = 0; i < out_num; i++)
-            dst_ptrs[i] = dstMemPtrs[i]->getDataAs<uint8_t>() + m_start_offset_out[i];
-    }
-
-    inline void update_ptrs(jit_snippets_call_args& call_args,
-                            const std::vector<const uint8_t*>& src_ptrs,
-                            const std::vector<uint8_t*>& dst_ptrs,
-                            const std::vector<size_t>& indexes) const {
-        for (size_t i = 0; i < src_ptrs.size(); i++) {
-            auto i_ptr = src_ptrs[i];
-            for (size_t j = 0; j < indexes.size(); j++) {
-                i_ptr += data_offsets[i][j] * indexes[j];
-            }
-            call_args.src_ptrs[i] = i_ptr;
-        }
-        for (size_t i = 0; i < dst_ptrs.size(); i++) {
-            auto i_ptr = dst_ptrs[i];
-            for (size_t j = 0; j < indexes.size(); j++) {
-                i_ptr += data_offsets[i + src_ptrs.size()][j] * indexes[j];
-            }
-            call_args.dst_ptrs[i] = i_ptr;
-        }
-    }
-
-    std::vector<size_t> buffer_offsets = {};
-    std::vector<std::vector<size_t>> data_offsets = {};
-    std::vector<jit_snippets_call_args::loop_args_t> loop_args = {};
-    std::function<void()> reset_exec_table_state;
-};
-
 struct SubgraphKey {
     SubgraphKey() = default;
-    SubgraphKey(const std::shared_ptr<Subgraph::SubgraphAttrs>& attrs_, const std::vector<VectorDims>& in_shapes_)
+    SubgraphKey(const std::shared_ptr<SubgraphAttrs>& attrs_, const std::vector<VectorDims>& in_shapes_)
         : attrs(attrs_),
           in_shapes(in_shapes_) {}
     virtual ~SubgraphKey() = default;
@@ -260,19 +73,19 @@ struct SubgraphKey {
     size_t hash() const;
     bool operator==(const SubgraphKey& rhs) const;
 
-    std::shared_ptr<Subgraph::SubgraphAttrs> attrs = nullptr;
+    std::shared_ptr<SubgraphAttrs> attrs = nullptr;
     std::vector<VectorDims> in_shapes = {};
 };
 
 struct SubgraphCodeGeneratorKey {
-    SubgraphCodeGeneratorKey(const std::shared_ptr<Subgraph::SubgraphAttrs>& attrs_, uint8_t mask_)
+    SubgraphCodeGeneratorKey(const std::shared_ptr<SubgraphAttrs>& attrs_, uint8_t mask_)
        : attrs(attrs_),
          broadcasting_mask(mask_) {}
 
     size_t hash() const;
     bool operator==(const SubgraphCodeGeneratorKey& rhs) const;
 
-    std::shared_ptr<Subgraph::SubgraphAttrs> attrs = nullptr;
+    std::shared_ptr<SubgraphAttrs> attrs = nullptr;
     uint8_t broadcasting_mask = 0;
 };
 
@@ -288,7 +101,7 @@ struct SubgraphShapeInferResultKey {
     uint64_t body_hash = 0;
 };
 
-size_t get_attr_hash(size_t seed, const std::shared_ptr<Subgraph::SubgraphAttrs>& attrs) {
+size_t get_attr_hash(size_t seed, const std::shared_ptr<SubgraphAttrs>& attrs) {
     using namespace dnnl::impl;
     using namespace dnnl::impl::primitive_hashing;
 
@@ -338,7 +151,7 @@ size_t SubgraphShapeInferResultKey::hash() const {
     return seed;
 }
 
-bool operator==(const Subgraph::SubgraphAttrs& lhs, const Subgraph::SubgraphAttrs& rhs) {
+bool operator==(const SubgraphAttrs& lhs, const SubgraphAttrs& rhs) {
     if (&lhs == &rhs)
         return true;
     if (lhs.bodyHash != rhs.bodyHash)
@@ -833,10 +646,10 @@ void Subgraph::optimizeIR() {
 
 void Subgraph::prepareParams() {
     const auto& cache = context->getParamsCache();
 
-    auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr<SubgraphExecutor> {
+    auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr<SubgraphBaseExecutor> {
         const auto& snippet = subgraph_attrs->snippet;
 
-        SubgraphExecutor::BufferScratchpadAllocator allocator = [this](size_t size) {
+        SubgraphBaseExecutor::BufferScratchpadAllocator allocator = [this](size_t size) {
            return getScratchPadMem(std::make_shared<CpuBlockedMemoryDesc>(ov::element::u8, intel_cpu::Shape{size}));
         };
 
@@ -859,11 +672,11 @@ void Subgraph::prepareParams() {
                     code_gen->get()->lowering_result.kernel_executor_table);
             }
             const auto& snippet_config = ov::as_type_ptr<CPURuntimeConfig>(snippet->update_runtime_config());
-            return std::make_shared<SubgraphDynamicSpecializedExecutor>(key.attrs,
+            return std::make_shared<SubgraphDynamicSpecializedExecutor>(snippet_config,
+                                                                        key.attrs,
                                                                         code_gen,
                                                                         start_offset_in,
                                                                         start_offset_out,
-                                                                        snippet_config,
                                                                         allocator,
                                                                         cache);
         } else {
@@ -878,11 +691,11 @@ void Subgraph::prepareParams() {
                 [&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr<SubgraphCodeGenerator> {
                     return std::make_shared<SubgraphCodeGenerator>(key.attrs, snippet_config);
                 });
-            return std::make_shared<SubgraphStaticExecutor>(key.attrs,
+            return std::make_shared<SubgraphStaticExecutor>(snippet_config,
+                                                            key.attrs,
                                                             code_gen_result.first,
                                                             start_offset_in,
                                                             start_offset_out,
-                                                            snippet_config,
                                                             allocator,
                                                             cache);
         }
@@ -944,322 +757,6 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) {
     execute(strm);
 }
 
-namespace {
-inline void init_parallel_domain(const std::vector<size_t>& master_shape,
-                                 size_t tensor_rank,
-                                 size_t tile_rank,
-                                 std::vector<size_t>& domain) {
-    domain.resize(tensor_rank, 1);
-    std::fill(domain.begin(), domain.end(), 1);
-    std::copy(master_shape.cbegin(),
-              master_shape.cbegin() + (master_shape.size() - tile_rank),
-              domain.begin() + (tensor_rank - master_shape.size()));
-}
-inline void init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config, std::vector<size_t>& domain) {
-    init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
-}
-}  // namespace
-
-Subgraph::SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                                                       const std::shared_ptr<CPURuntimeConfig>& config) {
-    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
-    OPENVINO_ASSERT(config, "Runtime Config is empty!");
-
-    jit_snippets_compile_args jcp;
-    jcp.data_offsets = config->io_data_offsets;
-    init_parallel_domain(config, jcp.exec_domain);
-    schedule =
-        std::make_shared<snippets::Schedule>(snippet_attrs->snippet->generate(reinterpret_cast<const void*>(&jcp)));
-}
-
-Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                                             const std::shared_ptr<Subgraph::SubgraphCodeGenerator>& snippet,
-                                             const std::vector<ptrdiff_t>& start_offset_in,
-                                             const std::vector<ptrdiff_t>& start_offset_out,
-                                             const std::shared_ptr<CPURuntimeConfig>& snippet_config,
-                                             const BufferScratchpadAllocator& allocator,
-                                             const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
-    : m_schedule(snippet->get()),
-      m_start_offset_in(start_offset_in),
-      m_start_offset_out(start_offset_out) {
-    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
-    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
-    init_parallel_domain(snippet_config, m_parallel_exec_domain);
-
-    m_tensor_rank = snippet_config->tensor_rank;
-    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
-                                            m_parallel_exec_domain.cend(),
-                                            size_t(1),
-                                            std::multiplies<size_t>());
-    m_nthreads = std::min(parallel_get_max_threads(), static_cast<int>(m_harness_work_amount));
-
-    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
-    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
-                    "Undefined buffer scratchpad size!");
-    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
-
-#ifdef OPENVINO_ARCH_X86_64
-    m_repacking_impl_type = snippet_config->repacking_impl_type;
-    m_repacked_inputs = snippet_config->repacked_inputs;
-
-    auto external_buffer_size =
-        std::accumulate(m_repacked_inputs.begin(),
-                        m_repacked_inputs.end(),
-                        size_t(0),
-                        [](size_t sum, const std::pair<size_t, RepackedInput>& p) {
-                            return sum + p.second.desc()->getCurrentMemSize();
-                        });
-
-    if (should_repacking_be_in_parallel()) {
-        // When external repacking is applied in parallel section,
-        // each thread should have own buffer to store repacked data
-        external_buffer_size *= m_nthreads;
-
-        // To avoid extra overheads in runtime on vector creation,
-        // we initialize `repacked_offsets_by_threads` by default here
-        m_repacked_offsets_by_threads.resize(m_nthreads);
-        for (size_t i = 0; i < m_repacked_offsets_by_threads.size(); ++i)
-            clean_repacked_offsets(i);
-    }
-
-#else
-    const auto external_buffer_size = 0lu;
-#endif  // OPENVINO_ARCH_X86_64
-    m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size);
-
-#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
-    const auto target = std::dynamic_pointer_cast<const CPUTargetMachine>(
-        snippet_attrs->snippet->get_generator()->get_target_machine());
-    enabled_segfault_detector = target && target->debug_config.enable_segfault_detector;
-#endif
-}
-
-#ifdef OPENVINO_ARCH_X86_64
-namespace {
-inline void parallel4d_repacking(const BrgemmCopyBKernel* ker,
-                                 const VectorDims& dom,
-                                 const VectorDims& in_str,
-                                 const VectorDims& out_str,
-                                 const uint8_t* src,
-                                 uint8_t* dst) {
-    parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) {
-        BrgemmCopyBKernel::call_args args;
-        args.src = src + d0 * in_str[0] + d1 * in_str[1] + d2 * in_str[2] + d3 * in_str[3];
-        args.tr_src = dst + d0 * out_str[0] + d1 * out_str[1] + d2 * out_str[2] + d3 * out_str[3];
-        (*ker)(&args);
-    });
-};
-inline void parallelNd_repacking(const BrgemmCopyBKernel* ker,
-                                 const VectorDims& dom,
-                                 const VectorDims& in_str,
-                                 const VectorDims& out_str,
-                                 const uint8_t* src,
-                                 uint8_t* dst) {
-    const size_t batch = std::accumulate(dom.rbegin() + 2, dom.rend(), 1lu, std::multiplies<size_t>());
-    parallel_nt_static(0, [&](const int ithr, const int nthr) {
-        BrgemmCopyBKernel::call_args args;
-        size_t start = 0, end = 0;
-        splitter(batch, nthr, ithr, start, end);
-        for (size_t iwork = start; iwork < end; ++iwork) {
-            const uint8_t* src_u8 = src;
-            uint8_t* dst_u8 = dst;
-            size_t tmp = iwork;
-            for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 3; j >= 0; j--) {
-                auto idx = tmp % dom[j];
-                tmp /= dom[j];
-
-                src_u8 += idx * in_str[j];
-                dst_u8 += idx * out_str[j];
-            }
-            args.src = src_u8;
-            args.tr_src = dst_u8;
-            (*ker)(&args);
-        }
-    });
-};
-}  // namespace
-std::vector<MemoryPtr> Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm,
-                                                                            const std::vector<MemoryPtr>& srcMemPtrs) {
-    auto reordered_in_ptrs = srcMemPtrs;
-    size_t offset = m_internal_buffer_size;
-    for (const auto& p : m_repacked_inputs) {
-        const auto in_idx = p.first;
-        const auto& repacked_input = p.second;
-        const auto& desc = repacked_input.desc();
-        const void* data_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + offset;
-
-        OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr");
-        const auto& src_mem = srcMemPtrs[in_idx];
-        const auto& dst_mem = std::make_shared<Memory>(strm.get_engine(), desc, data_ptr, false);
-
-        const auto* src = src_mem->getDataAs<const uint8_t>() + m_start_offset_in[in_idx];
-        auto* dst = dst_mem->getDataAs<uint8_t>();
-
-        VectorDims dom;
-        const auto& shape = dst_mem->getShape().getDims();
-        OPENVINO_ASSERT(shape.size() <= m_tensor_rank, "Unsupported shape rank of repacking data");
-        init_parallel_domain(shape, m_tensor_rank, 2lu, dom);
-
-        const auto& in_strides = repacked_input.in_offsets();
-        const auto& out_strides = repacked_input.out_offsets();
-        OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()),
-                        "Unsupported shape rank of repacking data");
-
-        const auto& kernel = repacked_input.kernel();
-        if (m_tensor_rank == rank6D)
-            parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);
-        else
-            parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);
-
-        reordered_in_ptrs[in_idx] = dst_mem;
-        offset += desc->getCurrentMemSize();
-    }
-    return reordered_in_ptrs;
-}
-
-void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vector<MemoryPtr>& inMemPtrs,
-                                                           const std::vector<size_t>& indexes,
-                                                           int ithr,
-                                                           jit_snippets_call_args& call_args) {
-    size_t repacked_offset_idx = 0;
-    for (const auto& p : m_repacked_inputs) {
-        const auto& in_idx = p.first;
-        const auto& repacked_in = p.second;
-
-        const auto& src_offsets = repacked_in.in_offsets();
-        size_t src_offset = m_start_offset_in[in_idx];
-        for (size_t j = 0; j < indexes.size(); j++)
-            src_offset += src_offsets[j] * indexes[j];
-
-        auto* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx);
-
-        auto& last_processed_src_offset = m_repacked_offsets_by_threads[ithr][repacked_offset_idx];
-        if (src_offset != last_processed_src_offset) {
-            BrgemmCopyBKernel::call_args args;
-            args.src = inMemPtrs[in_idx]->getDataAs<const uint8_t>() + src_offset;
-            args.tr_src = repacked_ptr;
-            (*repacked_in.kernel())(&args);
-
-            last_processed_src_offset = src_offset;
-        }
-
-        call_args.src_ptrs[in_idx] = repacked_ptr;
-        ++repacked_offset_idx;
-    }
-}
-#endif  // OPENVINO_ARCH_X86_64
-
-#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
-void Subgraph::SubgraphExecutor::segfault_detector() {
-    if (enabled_segfault_detector) {
-        __sighandler_t signal_handler = [](int signal) {
-            std::lock_guard<std::mutex> guard(err_print_lock);
-            if (auto segfault_detector_emitter = ov::intel_cpu::g_custom_segfault_handler->local())
-                std::cout << segfault_detector_emitter->info() << std::endl;
-            auto tid = parallel_get_thread_num();
-            OPENVINO_THROW("Segfault was caught by the signal handler in subgraph node execution on thread " +
-                           std::to_string(tid));
-        };
-        struct sigaction new_handler {};
-        new_handler.sa_handler = signal_handler;
-        sigaction(SIGSEGV, &new_handler, nullptr);
-    }
-}
-#endif
-
-void Subgraph::SubgraphExecutor::parallel_for6d(
-    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
-    const auto& dom = m_parallel_exec_domain;
-
-#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
-    segfault_detector();
-#endif
-
-    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
-        jit_snippets_call_args call_args;
-        initializer(call_args, ithr);
-
-        size_t start = 0, end = 0;
-        splitter(m_harness_work_amount, nthr, ithr, start, end);
-
-        std::vector<size_t> indexes{0, 0, 0, 0, 0};
-        parallel_it_init(start, indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2],
-                         indexes[3], dom[3], indexes[4], dom[4]);
-        for (size_t iwork = start; iwork < end; ++iwork) {
-            caller(call_args, indexes, ithr);
-            parallel_it_step(indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2],
-                             indexes[3], dom[3], indexes[4], dom[4]);
-        }
-
-#ifdef OPENVINO_ARCH_X86_64
-        clean_repacked_offsets(ithr);
-#endif  // OPENVINO_ARCH_X86_64
-    });
-}
-
-void Subgraph::SubgraphExecutor::parallel_forNd(
-    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
-    const auto& dom = m_parallel_exec_domain;
-
-#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
-    segfault_detector();
-#endif
-
-    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
-        jit_snippets_call_args call_args;
-        initializer(call_args, ithr);
-
-        size_t start = 0, end = 0;
-        splitter(m_harness_work_amount, nthr, ithr, start, end);
-
-        std::vector<size_t> indexes(dom.size() - 1, 0);
-        for (size_t iwork = start; iwork < end; ++iwork) {
-            size_t tmp = iwork;
-            for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 2; j >= 0; j--) {
-                indexes[j] = tmp % dom[j];
-                tmp /= dom[j];
-            }
-
-            caller(call_args, indexes, ithr);
-        }
-
-#ifdef OPENVINO_ARCH_X86_64
-        clean_repacked_offsets(ithr);
-#endif  // OPENVINO_ARCH_X86_64
-    });
-}
-
-void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm,
-                                         const std::vector<MemoryPtr>& inMemPtrs,
-                                         const std::vector<MemoryPtr>& outMemPtrs) {
-#ifdef OPENVINO_ARCH_X86_64
-    if (should_repacking_be_separately()) {
-        exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs);
-        return;
-    }
-#endif  // OPENVINO_ARCH_X86_64
-    exec_impl(inMemPtrs, outMemPtrs);
-}
-
 }  // namespace node
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h
index dd89d7e71182f8..ea7d51650e5cad 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.h
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.h
@@ -4,10 +4,8 @@
 
 #pragma once
 
-#include "emitters/snippets/cpu_runtime_configurator.hpp"
-#include "emitters/snippets/jit_snippets_call_args.hpp"
+#include "executors/subgraph.hpp"
 #include "node.h"
-#include "snippets/op/subgraph.hpp"
 
 #if defined(OPENVINO_ARCH_ARM64)
 #    include "cpu/aarch64/cpu_isa_traits.hpp"
@@ -15,8 +13,6 @@
 #    include "cpu/x64/cpu_isa_traits.hpp"
 #endif
 
-#include <array>
-
 namespace ov {
 namespace intel_cpu {
 namespace node {
@@ -41,21 +37,6 @@ class Subgraph : public Node {
     void execute(dnnl::stream strm) override;
     void executeDynamicImpl(dnnl::stream strm) override;
 
-    struct SubgraphAttrs {
-        // Local copy of subgraph node for canonization & code generation
-        std::shared_ptr<snippets::op::Subgraph> snippet;
-        uint64_t bodyHash;
-        std::vector<std::vector<size_t>> inMemOrders;
-        std::vector<std::vector<size_t>> outMemOrders;
-        std::vector<ov::element::Type> inMemPrecs;
-        std::vector<ov::element::Type> outMemPrecs;
-    };
-
-    // Class for snippet compilation
-    class SubgraphCodeGenerator;
-    // Base class for executors
-    class SubgraphExecutor;
-
 protected:
     IShapeInfer::Result shapeInfer() const override;
 
@@ -103,118 +84,7 @@ class Subgraph : public Node {
     // Input shapes that are used in PrepareParams and ShapeInfer to avoid frequent memory allocation
     mutable std::vector<VectorDims> in_shapes;
 
-    std::shared_ptr<SubgraphExecutor> execPtr = nullptr;
-};
-
-class Subgraph::SubgraphCodeGenerator {
-public:
-    SubgraphCodeGenerator(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                          const std::shared_ptr<CPURuntimeConfig>& config);
-
-    const std::shared_ptr<snippets::Schedule>& get() const {
-        return schedule;
-    }
-
-private:
-    std::shared_ptr<snippets::Schedule> schedule;
-};
-
-class Subgraph::SubgraphExecutor {
-public:
-    using BufferScratchpadAllocator = std::function<MemoryPtr(size_t)>;
-
-    SubgraphExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
-                     const std::shared_ptr<Subgraph::SubgraphCodeGenerator>& snippet,
-                     const std::vector<ptrdiff_t>& start_offset_in,
-                     const std::vector<ptrdiff_t>& start_offset_out,
-                     const std::shared_ptr<CPURuntimeConfig>& snippet_config,
-                     const BufferScratchpadAllocator& allocator,
-                     const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache);
-    virtual ~SubgraphExecutor() = default;
-
-    void execute(const dnnl::stream& strm,
-                 const std::vector<MemoryPtr>& inMemPtrs,
-                 const std::vector<MemoryPtr>& outMemPtrs);
-
-protected:
-    virtual void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;
-
-    void parallel_for6d(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-                        const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
-    void parallel_forNd(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-                        const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller);
-
-    inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const {
-        if (m_buffer_scratchpad_size > 0)
-            scratchpad_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + ithr * m_buffer_scratchpad_size;
-    }
-
-    std::shared_ptr<snippets::Schedule> m_schedule;
-    // Holds index of output used as in execution domain
-    // it should be compatible with a schedule's work size
-    std::vector<size_t> m_parallel_exec_domain = {};
-    size_t m_harness_work_amount = 0;
-
-    // Buffer scratchpad
-    MemoryPtr m_buffer_scratchpad = nullptr;
-    size_t m_buffer_scratchpad_size = 0;
-    size_t m_internal_buffer_size = 0;
-    size_t m_tensor_rank = 0;
-
-    const size_t rank6D = 6;
-
-    // Count of threads for parallel_nt
-    int m_nthreads = 0;
-
-    std::vector<ptrdiff_t> m_start_offset_in = {};
-    std::vector<ptrdiff_t> m_start_offset_out = {};
-
-#ifdef SNIPPETS_DEBUG_CAPS
-    bool enabled_segfault_detector = false;
-    inline void segfault_detector();
-#endif
-
-#ifdef OPENVINO_ARCH_X86_64
-    std::vector<MemoryPtr> separately_repack_inputs(const dnnl::stream& strm, const std::vector<MemoryPtr>& srcMemPtrs);
-    void in_parallel_repack_inputs(const std::vector<MemoryPtr>& inMemPtrs,
-                                   const std::vector<size_t>& indexes,
-                                   int ithr,
-                                   jit_snippets_call_args& call_args);
-
-    inline void* get_external_scratchpad_ptr(size_t ithr, size_t idx) const {
-        if (m_repacked_inputs.empty())
-            return nullptr;
-
-        uint8_t* data_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + m_internal_buffer_size;
-        for (const auto& p : m_repacked_inputs) {
-            const auto& desc = p.second.desc();
-            const auto size = desc->getCurrentMemSize();
-            if (p.first == idx) {
-                return data_ptr + ithr * size;
-            }
-            data_ptr += m_nthreads * size;
-        }
-        OPENVINO_THROW("External buffer pointer has not been found");
-    }
-
-    // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ]
-    std::vector<std::vector<size_t>> m_repacked_offsets_by_threads = {};
-    std::unordered_map<size_t, RepackedInput> m_repacked_inputs = {};
-
-    inline bool should_repacking_be_separately() const {
-        return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::SEPARATE;
-    }
-    inline bool should_repacking_be_in_parallel() const {
-        return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL;
-    }
-    inline void clean_repacked_offsets(size_t ithr) {
-        if (should_repacking_be_in_parallel())
-            m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits<size_t>::max());
-    }
-
-private:
-    CPURuntimeConfig::RepackingImplType m_repacking_impl_type = CPURuntimeConfig::RepackingImplType::NONE;
-#endif  // OPENVINO_ARCH_X86_64
+    std::shared_ptr<SubgraphBaseExecutor> execPtr = nullptr;
 };
 
 }  // namespace node
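Taken together, the refactor leaves callers with a single polymorphic entry point. A condensed usage sketch of the resulting API shape (all types are mocked stand-ins; the real selection between the two executors happens in Subgraph::prepareParams() above):

#include <memory>
#include <vector>

struct MemoryPtrStub {};
struct StreamStub {};

struct BaseExecutor {  // stands in for SubgraphBaseExecutor
    virtual ~BaseExecutor() = default;
    virtual void execute(const StreamStub&, const std::vector<MemoryPtrStub>&, const std::vector<MemoryPtrStub>&) = 0;
};
struct StaticExecutor : BaseExecutor {  // stands in for SubgraphStaticExecutor
    void execute(const StreamStub&, const std::vector<MemoryPtrStub>&, const std::vector<MemoryPtrStub>&) override {}
};
struct DynamicExecutor : BaseExecutor {  // stands in for SubgraphDynamicSpecializedExecutor
    void execute(const StreamStub&, const std::vector<MemoryPtrStub>&, const std::vector<MemoryPtrStub>&) override {}
};

int main() {
    const bool is_dynamic = true;
    std::shared_ptr<BaseExecutor> execPtr;
    if (is_dynamic)
        execPtr = std::make_shared<DynamicExecutor>();  // shape-agnostic kernel, specialized offsets
    else
        execPtr = std::make_shared<StaticExecutor>();   // offsets compiled into the JIT blob
    execPtr->execute(StreamStub{}, {}, {});
}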