-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Snippets][CPU] Created new Executors
- Loading branch information
1 parent
9bb9646
commit 7be609b
Showing
8 changed files
with
855 additions
and
649 deletions.
There are no files selected for viewing
65 changes: 65 additions & 0 deletions
65
src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "nodes/executors/aarch64/subgraph.hpp" | ||
|
||
#include "snippets/op/subgraph.hpp" | ||
|
||
|
||
namespace ov { | ||
namespace intel_cpu { | ||
|
||
void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs, | ||
const std::vector<MemoryPtr>& outMemPtrs) { | ||
const auto& callable = m_schedule->get_callable<kernel>(); | ||
|
||
auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { | ||
init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr); | ||
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); | ||
}; | ||
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) { | ||
callable(&call_args, indexes.data()); | ||
}; | ||
|
||
if (m_parallel_exec_domain.size() == rank6D) { | ||
parallel_for6d(initializer, caller); | ||
} else { | ||
parallel_forNd(initializer, caller); | ||
} | ||
} | ||
|
||
void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs, | ||
const std::vector<MemoryPtr>& outMemPtrs) { | ||
const auto& callable = m_schedule->get_callable<dynamic_kernel>(); | ||
|
||
OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); | ||
OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), | ||
"Data offsets with invalid ranks detected"); | ||
|
||
// Note: we need to reset KernelExecutorTable to the state that was recorded in the | ||
// SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes | ||
m_reset_exec_table_state(); | ||
|
||
std::vector<const uint8_t*> src_ptrs; | ||
std::vector<uint8_t*> dst_ptrs; | ||
init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); | ||
|
||
auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { | ||
init_call_args(call_args, ithr); | ||
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); | ||
}; | ||
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) { | ||
update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); | ||
callable(&call_args); | ||
}; | ||
|
||
if (m_parallel_exec_domain.size() == rank6D) { | ||
parallel_for6d(initializer, caller); | ||
} else { | ||
parallel_forNd(initializer, caller); | ||
} | ||
} | ||
|
||
} // namespace intel_cpu | ||
} // namespace ov |
39 changes: 39 additions & 0 deletions
39
src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include <utility>

#include "nodes/executors/subgraph.hpp"
|
||
namespace ov { | ||
namespace intel_cpu { | ||
|
||
class SubgraphExecutor : public SubgraphBaseExecutor { | ||
public: | ||
template<typename ...Args> | ||
SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args) { | ||
m_buffer_scratchpad = allocator(m_internal_buffer_size); | ||
} | ||
}; | ||
|
||
class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { | ||
public: | ||
template<typename ...Args> | ||
SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args) | ||
: SubgraphExecutor(snippet_config, args...), SubgraphStaticBaseExecutor() {} | ||
|
||
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override; | ||
}; | ||
|
||
class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { | ||
public: | ||
template<typename ...Args> | ||
SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args) | ||
: SubgraphExecutor(snippet_config, args...), SubgraphDynamicSpecializedBaseExecutor(snippet_config) {} | ||
|
||
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override; | ||
}; | ||
|
||
} // namespace intel_cpu | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "nodes/executors/subgraph.hpp" | ||
#if defined(OPENVINO_ARCH_ARM64) | ||
# include "emitters/snippets/aarch64/cpu_generator.hpp" | ||
#else | ||
# include "emitters/snippets/x64/cpu_generator.hpp" | ||
#endif | ||
#include "openvino/core/parallel.hpp" | ||
|
||
namespace ov { | ||
namespace intel_cpu { | ||
|
||
/// Generates the snippet code for the given runtime configuration and stores the resulting schedule.
///
/// @param snippet_attrs  subgraph attributes; must be non-null, its `snippet` performs the generation
/// @param config         runtime configuration; must be non-null, supplies the I/O data offsets and
///                       the parallel domain used as compile arguments
SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                             const std::shared_ptr<CPURuntimeConfig>& config) {
    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
    OPENVINO_ASSERT(config, "Runtime Config is empty!");

    jit_snippets_compile_args compile_args;
    compile_args.data_offsets = config->io_data_offsets;
    SubgraphBaseExecutor::init_parallel_domain(config, compile_args.exec_domain);

    // The generator takes the compile arguments as an untyped pointer.
    schedule = std::make_shared<ov::snippets::Schedule>(
        snippet_attrs->snippet->generate(static_cast<const void*>(&compile_args)));
}
|
||
/// Initializes the state shared by all subgraph executors: the schedule, the start offsets,
/// the parallel execution domain, the thread count and the internal buffer size.
///
/// @param snippet_config   runtime configuration; must be non-null
/// @param snippet_attrs    subgraph attributes (not used by this constructor body)
/// @param snippet          code generator; its schedule must be non-empty
/// @param start_offset_in  start offsets of the inputs, copied into the executor
/// @param start_offset_out start offsets of the outputs, copied into the executor
/// @param allocator        scratchpad allocator (not used by this constructor body)
/// @param kernel_cache     kernel cache handle (not used by this constructor body)
SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
                                           const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                           const std::shared_ptr<SubgraphCodeGenerator>& snippet,
                                           const std::vector<ptrdiff_t>& start_offset_in,
                                           const std::vector<ptrdiff_t>& start_offset_out,
                                           const BufferScratchpadAllocator& allocator,
                                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
    : m_schedule(snippet->get()),
      m_start_offset_in(start_offset_in),
      m_start_offset_out(start_offset_out) {
    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
    init_parallel_domain(snippet_config, m_parallel_exec_domain);

    m_tensor_rank = snippet_config->tensor_rank;
    // Total number of work items is the product of all parallel domain dimensions.
    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
                                            m_parallel_exec_domain.cend(),
                                            size_t(1),
                                            std::multiplies<size_t>());
    // Clamp in size_t: narrowing the work amount to int first would wrap for values above INT_MAX
    // and could yield a zero or negative thread count. The result never exceeds the maximum
    // thread count, so converting it back to int is lossless.
    const auto max_threads = static_cast<size_t>(parallel_get_max_threads());
    m_nthreads = static_cast<int>(std::min<size_t>(max_threads, m_harness_work_amount));

    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
                    "Undefined buffer scratchpad size!");
    // One scratchpad region of m_buffer_scratchpad_size bytes per worker thread.
    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
}
|
||
void SubgraphBaseExecutor::init_parallel_domain(const std::vector<size_t>& master_shape, | ||
size_t tensor_rank, | ||
size_t tile_rank, | ||
std::vector<size_t>& domain) { | ||
domain.resize(tensor_rank, 1); | ||
std::fill(domain.begin(), domain.end(), 1); | ||
std::copy(master_shape.cbegin(), | ||
master_shape.cbegin() + (master_shape.size() - tile_rank), | ||
domain.begin() + (tensor_rank - master_shape.size())); | ||
} | ||
|
||
void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config, | ||
std::vector<size_t>& domain) { | ||
init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); | ||
} | ||
void SubgraphBaseExecutor::parallel_for6d( | ||
const std::function<void(jit_snippets_call_args&, size_t)>& initializer, | ||
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) { | ||
const auto& dom = m_parallel_exec_domain; | ||
|
||
parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { | ||
jit_snippets_call_args call_args; | ||
initializer(call_args, ithr); | ||
|
||
size_t start = 0, end = 0; | ||
splitter(m_harness_work_amount, nthr, ithr, start, end); | ||
|
||
std::vector<size_t> indexes{0, 0, 0, 0, 0}; | ||
parallel_it_init(start, | ||
indexes[0], | ||
dom[0], | ||
indexes[1], | ||
dom[1], | ||
indexes[2], | ||
dom[2], | ||
indexes[3], | ||
dom[3], | ||
indexes[4], | ||
dom[4]); | ||
for (size_t iwork = start; iwork < end; ++iwork) { | ||
caller(call_args, indexes, ithr); | ||
parallel_it_step(indexes[0], | ||
dom[0], | ||
indexes[1], | ||
dom[1], | ||
indexes[2], | ||
dom[2], | ||
indexes[3], | ||
dom[3], | ||
indexes[4], | ||
dom[4]); | ||
} | ||
}); | ||
} | ||
|
||
void SubgraphBaseExecutor::parallel_forNd( | ||
const std::function<void(jit_snippets_call_args&, size_t)>& initializer, | ||
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) { | ||
const auto& dom = m_parallel_exec_domain; | ||
|
||
parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) { | ||
jit_snippets_call_args call_args; | ||
initializer(call_args, ithr); | ||
|
||
size_t start = 0, end = 0; | ||
splitter(m_harness_work_amount, nthr, ithr, start, end); | ||
|
||
std::vector<size_t> indexes(dom.size() - 1, 0); | ||
for (size_t iwork = start; iwork < end; ++iwork) { | ||
size_t tmp = iwork; | ||
for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 2; j >= 0; j--) { | ||
indexes[j] = tmp % dom[j]; | ||
tmp /= dom[j]; | ||
} | ||
|
||
caller(call_args, indexes, ithr); | ||
} | ||
}); | ||
} | ||
|
||
/// Public entry point: delegates the execution to exec_impl().
///
/// @param strm        stream; not consulted by this function
/// @param inMemPtrs   input memory objects, passed on to exec_impl()
/// @param outMemPtrs  output memory objects, passed on to exec_impl()
void SubgraphBaseExecutor::execute(const dnnl::stream& strm,
                                   const std::vector<MemoryPtr>& inMemPtrs,
                                   const std::vector<MemoryPtr>& outMemPtrs) {
    static_cast<void>(strm);  // intentionally unused
    exec_impl(inMemPtrs, outMemPtrs);
}
|
||
} // namespace intel_cpu | ||
} // namespace ov |
Oops, something went wrong.