Skip to content

Commit 67dccc2

Browse files
committed
[Snippets][CPU] Created new Executors
1 parent 9bb9646 commit 67dccc2

File tree

8 files changed

+853
-647
lines changed

8 files changed

+853
-647
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// Copyright (C) 2018-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "nodes/executors/aarch64/subgraph.hpp"
6+
7+
#include "snippets/op/subgraph.hpp"
8+
9+
10+
namespace ov {
11+
namespace intel_cpu {
12+
13+
void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
14+
const std::vector<MemoryPtr>& outMemPtrs) {
15+
const auto& callable = m_schedule->get_callable<kernel>();
16+
17+
auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
18+
init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, ithr);
19+
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
20+
};
21+
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
22+
callable(&call_args, indexes.data());
23+
};
24+
25+
if (m_parallel_exec_domain.size() == rank6D) {
26+
parallel_for6d(initializer, caller);
27+
} else {
28+
parallel_forNd(initializer, caller);
29+
}
30+
}
31+
32+
void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
33+
const std::vector<MemoryPtr>& outMemPtrs) {
34+
const auto& callable = m_schedule->get_callable<dynamic_kernel>();
35+
36+
OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
37+
OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(),
38+
"Data offsets with invalid ranks detected");
39+
40+
// Note: we need to reset KernelExecutorTable to the state that was recorded in the
41+
// SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes
42+
m_reset_exec_table_state();
43+
44+
std::vector<const uint8_t*> src_ptrs;
45+
std::vector<uint8_t*> dst_ptrs;
46+
init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out);
47+
48+
auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
49+
init_call_args(call_args, ithr);
50+
update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr);
51+
};
52+
auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
53+
update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
54+
callable(&call_args);
55+
};
56+
57+
if (m_parallel_exec_domain.size() == rank6D) {
58+
parallel_for6d(initializer, caller);
59+
} else {
60+
parallel_forNd(initializer, caller);
61+
}
62+
}
63+
64+
} // namespace intel_cpu
65+
} // namespace ov
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright (C) 2018-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "nodes/executors/subgraph.hpp"
8+
9+
namespace ov {
10+
namespace intel_cpu {
11+
12+
class SubgraphExecutor : public SubgraphBaseExecutor {
13+
public:
14+
template<typename ...Args>
15+
SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args) {
16+
m_buffer_scratchpad = allocator(m_internal_buffer_size);
17+
}
18+
};
19+
20+
class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor {
21+
public:
22+
template<typename ...Args>
23+
SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args)
24+
: SubgraphExecutor(snippet_config, args...), SubgraphStaticBaseExecutor() {}
25+
26+
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
27+
};
28+
29+
class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor {
30+
public:
31+
template<typename ...Args>
32+
SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args ...args)
33+
: SubgraphExecutor(snippet_config, args...), SubgraphDynamicSpecializedBaseExecutor(snippet_config) {}
34+
35+
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
36+
};
37+
38+
} // namespace intel_cpu
39+
} // namespace ov
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
// Copyright (C) 2018-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "nodes/executors/subgraph.hpp"
6+
#if defined(OPENVINO_ARCH_ARM64)
7+
# include "emitters/snippets/aarch64/cpu_generator.hpp"
8+
#else
9+
# include "emitters/snippets/x64/cpu_generator.hpp"
10+
#endif
11+
12+
namespace ov {
13+
namespace intel_cpu {
14+
15+
// Generates the schedule for a subgraph.
// Both arguments must be non-null. The compile arguments carry the I/O data offsets from
// the runtime config and the parallel execution domain derived from it.
SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                             const std::shared_ptr<CPURuntimeConfig>& config) {
    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
    OPENVINO_ASSERT(config, "Runtime Config is empty!");

    jit_snippets_compile_args compile_args;
    compile_args.data_offsets = config->io_data_offsets;
    SubgraphBaseExecutor::init_parallel_domain(config, compile_args.exec_domain);

    // generate() receives the compile arguments as an opaque pointer.
    const void* const compile_params = &compile_args;
    schedule = std::make_shared<ov::snippets::Schedule>(snippet_attrs->snippet->generate(compile_params));
}
26+
27+
// Initialises the state shared by all subgraph executors.
//
// Takes the schedule from the code generator, stores the input/output start offsets,
// derives the parallel execution domain from the runtime config, and computes:
//   - m_harness_work_amount: product of all parallel domain dimensions;
//   - m_nthreads: the smaller of the maximum thread count and the work amount;
//   - m_internal_buffer_size: m_nthreads * per-thread scratchpad size.
//
// snippet_attrs, allocator and kernel_cache are not used by this constructor.
// Asserts if the code generator, its schedule or the runtime config is null, or if the
// scratchpad size is a dynamic value.
SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
                                           const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
                                           const std::shared_ptr<SubgraphCodeGenerator>& snippet,
                                           const std::vector<ptrdiff_t>& start_offset_in,
                                           const std::vector<ptrdiff_t>& start_offset_out,
                                           const BufferScratchpadAllocator& allocator,
                                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
    // Guard the dereference: a null generator is reported by the asserts below instead of
    // crashing inside the member initializer list.
    : m_schedule(snippet ? snippet->get() : nullptr),
      m_start_offset_in(start_offset_in),
      m_start_offset_out(start_offset_out) {
    OPENVINO_ASSERT(snippet, "Subgraph code generator is empty!");
    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
    init_parallel_domain(snippet_config, m_parallel_exec_domain);

    m_tensor_rank = snippet_config->tensor_rank;
    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
                                            m_parallel_exec_domain.cend(),
                                            size_t(1),
                                            std::multiplies<size_t>());
    // Compare in size_t and narrow afterwards: casting the work amount to int first would
    // truncate large values (possibly to zero or a negative thread count). The result never
    // exceeds the maximum thread count, so the final narrowing is lossless.
    const auto max_threads = static_cast<size_t>(parallel_get_max_threads());
    m_nthreads = static_cast<int>(std::min<size_t>(max_threads, m_harness_work_amount));

    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
                    "Undefined buffer scratchpad size!");
    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
}
53+
54+
void SubgraphBaseExecutor::init_parallel_domain(const std::vector<size_t>& master_shape,
55+
size_t tensor_rank,
56+
size_t tile_rank,
57+
std::vector<size_t>& domain) {
58+
domain.resize(tensor_rank, 1);
59+
std::fill(domain.begin(), domain.end(), 1);
60+
std::copy(master_shape.cbegin(),
61+
master_shape.cbegin() + (master_shape.size() - tile_rank),
62+
domain.begin() + (tensor_rank - master_shape.size()));
63+
}
64+
65+
void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
66+
std::vector<size_t>& domain) {
67+
init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain);
68+
}
69+
void SubgraphBaseExecutor::parallel_for6d(
70+
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
71+
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
72+
const auto& dom = m_parallel_exec_domain;
73+
74+
parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
75+
jit_snippets_call_args call_args;
76+
initializer(call_args, ithr);
77+
78+
size_t start = 0, end = 0;
79+
splitter(m_harness_work_amount, nthr, ithr, start, end);
80+
81+
std::vector<size_t> indexes{0, 0, 0, 0, 0};
82+
parallel_it_init(start,
83+
indexes[0],
84+
dom[0],
85+
indexes[1],
86+
dom[1],
87+
indexes[2],
88+
dom[2],
89+
indexes[3],
90+
dom[3],
91+
indexes[4],
92+
dom[4]);
93+
for (size_t iwork = start; iwork < end; ++iwork) {
94+
caller(call_args, indexes, ithr);
95+
parallel_it_step(indexes[0],
96+
dom[0],
97+
indexes[1],
98+
dom[1],
99+
indexes[2],
100+
dom[2],
101+
indexes[3],
102+
dom[3],
103+
indexes[4],
104+
dom[4]);
105+
}
106+
});
107+
}
108+
109+
void SubgraphBaseExecutor::parallel_forNd(
110+
const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
111+
const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
112+
const auto& dom = m_parallel_exec_domain;
113+
114+
parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
115+
jit_snippets_call_args call_args;
116+
initializer(call_args, ithr);
117+
118+
size_t start = 0, end = 0;
119+
splitter(m_harness_work_amount, nthr, ithr, start, end);
120+
121+
std::vector<size_t> indexes(dom.size() - 1, 0);
122+
for (size_t iwork = start; iwork < end; ++iwork) {
123+
size_t tmp = iwork;
124+
for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 2; j >= 0; j--) {
125+
indexes[j] = tmp % dom[j];
126+
tmp /= dom[j];
127+
}
128+
129+
caller(call_args, indexes, ithr);
130+
}
131+
});
132+
}
133+
134+
void SubgraphBaseExecutor::execute(const dnnl::stream& strm,
135+
const std::vector<MemoryPtr>& inMemPtrs,
136+
const std::vector<MemoryPtr>& outMemPtrs) {
137+
exec_impl(inMemPtrs, outMemPtrs);
138+
}
139+
140+
} // namespace intel_cpu
141+
} // namespace ov

0 commit comments

Comments
 (0)