Skip to content

Commit

Permalink
[Snippets][CPU] Small optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova committed Dec 28, 2024
1 parent cced16d commit fea453e
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 81 deletions.
99 changes: 75 additions & 24 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,19 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor {
init_call_args(call_args, inMemPtrs, outMemPtrs, ithr);
};

auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;
call_functor caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
callable(&call_args, indexes.data());
};

#ifdef OPENVINO_ARCH_X86_64
if (should_repacking_be_in_parallel())
if (should_repacking_be_in_parallel()) {
caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
callable(&call_args, indexes.data());
};
}
#endif // OPENVINO_ARCH_X86_64
callable(&call_args, indexes.data());
};

if (m_parallel_exec_domain.size() == rank6D) {
parallel_for6d(initializer, caller);
Expand Down Expand Up @@ -169,14 +175,21 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor {
init_call_args(call_args, ithr);
};

auto caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
using call_functor = std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>;
call_functor caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
callable(&call_args);
};

#ifdef OPENVINO_ARCH_X86_64
if (should_repacking_be_in_parallel())
if (should_repacking_be_in_parallel()) {
caller = [&](jit_snippets_call_args& call_args, const std::vector<size_t>& indexes, size_t ithr) {
update_ptrs(call_args, src_ptrs, dst_ptrs, indexes);
in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args);
callable(&call_args);
};
}
#endif // OPENVINO_ARCH_X86_64
callable(&call_args);
};

if (m_parallel_exec_domain.size() == rank6D) {
parallel_for6d(initializer, caller);
Expand Down Expand Up @@ -973,6 +986,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::Sub
OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
init_parallel_domain(snippet_config, m_parallel_exec_domain);

m_tensor_rank = snippet_config->tensor_rank;
m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
m_parallel_exec_domain.cend(),
size_t(1),
Expand Down Expand Up @@ -1021,6 +1035,49 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::Sub
}

#ifdef OPENVINO_ARCH_X86_64
namespace {
inline void parallel4d_repacking(const BrgemmCopyBKernel* ker,
const VectorDims& dom,
const VectorDims& in_str,
const VectorDims& out_str,
const uint8_t* src,
uint8_t* dst) {
parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) {
BrgemmCopyBKernel::call_args args;
args.src = src + d0 * in_str[0] + d1 * in_str[1] + d2 * in_str[2] + d3 * in_str[3];
args.tr_src = dst + d0 * out_str[0] + d1 * out_str[1] + d2 * out_str[2] + d3 * out_str[3];
(*ker)(&args);
});
};
/**
 * @brief Invokes the repacking kernel for every batch element of a domain of arbitrary rank.
 *
 * All entries of `dom` except the last two are flattened into a single linear
 * batch range. That range is divided between threads with `splitter`, and each
 * linear index is decomposed back into per-dimension indexes to build the
 * source/destination shifts.
 *
 * @param ker      Repacking kernel; called once per batch element.
 * @param dom      Parallel domain; must contain at least two entries (the last two are not iterated).
 * @param in_str   Per-dimension offsets applied to `src`.
 * @param out_str  Per-dimension offsets applied to `dst`.
 * @param src      Base pointer of the source data.
 * @param dst      Base pointer of the repacked destination data.
 *
 * NOTE(review): offsets are added to `uint8_t*` pointers, so `in_str`/`out_str`
 * are presumably expressed in bytes - confirm against the producer of these offsets.
 */
inline void parallelNd_repacking(const BrgemmCopyBKernel* ker,
                                 const VectorDims& dom,
                                 const VectorDims& in_str,
                                 const VectorDims& out_str,
                                 const uint8_t* src,
                                 uint8_t* dst) {
    // The accumulator type of std::accumulate is the type of the seed. Seeding with `size_t(1)`
    // (instead of `1lu`, which is a 32-bit `unsigned long` on LLP64 platforms) keeps the product
    // in `size_t` and matches the type of `start`/`end` passed to `splitter` below.
    const auto batch = std::accumulate(dom.rbegin() + 2, dom.rend(), size_t(1), std::multiplies<size_t>());
    parallel_nt_static(0, [&](const int ithr, const int nthr) {
        BrgemmCopyBKernel::call_args args;
        size_t start = 0, end = 0;
        splitter(batch, nthr, ithr, start, end);
        for (size_t iwork = start; iwork < end; ++iwork) {
            const uint8_t* src_u8 = src;
            uint8_t* dst_u8 = dst;
            size_t tmp = iwork;
            // Decompose the linear batch index, walking from the innermost batch dimension
            // (dom.size() - 3) outwards to dimension 0.
            for (ptrdiff_t j = static_cast<ptrdiff_t>(dom.size()) - 3; j >= 0; j--) {
                auto idx = tmp % dom[j];
                tmp /= dom[j];

                src_u8 += idx * in_str[j];
                dst_u8 += idx * out_str[j];
            }
            args.src = src_u8;
            args.tr_src = dst_u8;
            (*ker)(&args);
        }
    });
}
} // namespace
std::vector<MemoryPtr> Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm,
const std::vector<MemoryPtr>& srcMemPtrs) {
auto reordered_in_ptrs = srcMemPtrs;
Expand All @@ -1040,21 +1097,19 @@ std::vector<MemoryPtr> Subgraph::SubgraphExecutor::separately_repack_inputs(cons

VectorDims dom;
const auto& shape = dst_mem->getShape().getDims();
OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data");
init_parallel_domain(shape, rank6D, 2lu, dom);
OPENVINO_ASSERT(shape.size() <= m_tensor_rank, "Unsupported shape rank of repacking data");
init_parallel_domain(shape, m_tensor_rank, 2lu, dom);

const auto& in_strides = repacked_input.in_offsets();
const auto& out_strides = repacked_input.out_offsets();
OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D,
OPENVINO_ASSERT(everyone_is(m_tensor_rank, in_strides.size(), out_strides.size(), dom.size()),
"Unsupported shape rank of repacking data");

const auto& kernel = repacked_input.kernel();
parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) {
BrgemmCopyBKernel::call_args args;
args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3];
args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3];
(*kernel)(&args);
});
if (m_tensor_rank == rank6D)
parallel4d_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);
else
parallelNd_repacking(kernel.get(), dom, in_strides, out_strides, src, dst);

reordered_in_ptrs[in_idx] = dst_mem;
offset += desc->getCurrentMemSize();
Expand All @@ -1072,15 +1127,11 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vector<Mem
const auto& repacked_in = p.second;

const auto& src_offsets = repacked_in.in_offsets();
const auto& dst_offsets = repacked_in.out_offsets();

size_t src_offset = m_start_offset_in[in_idx], dst_offset = 0;
for (size_t j = 0; j < indexes.size(); j++) {
size_t src_offset = m_start_offset_in[in_idx];
for (size_t j = 0; j < indexes.size(); j++)
src_offset += src_offsets[j] * indexes[j];
dst_offset += dst_offsets[j] * indexes[j];
}

uint8_t* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx) + dst_offset;
auto* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx);

auto& last_processed_src_offset = m_repacked_offsets_by_threads[ithr][repacked_offset_idx];
if (src_offset != last_processed_src_offset) {
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/nodes/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ class Subgraph::SubgraphExecutor {
MemoryPtr m_buffer_scratchpad = nullptr;
size_t m_buffer_scratchpad_size = 0;
size_t m_internal_buffer_size = 0;
size_t m_tensor_rank = 0;

const size_t rank6D = 6;

Expand All @@ -180,7 +181,7 @@ class Subgraph::SubgraphExecutor {
int ithr,
jit_snippets_call_args& call_args);

inline uint8_t* get_external_scratchpad_ptr(size_t ithr, size_t idx) const {
inline void* get_external_scratchpad_ptr(size_t ithr, size_t idx) const {
if (m_repacked_inputs.empty())
return nullptr;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,38 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp
}
}

/**
 * @brief Builds the dimension order of the blocked (repacked) layout for a tensor of rank `shape_rank`.
 *
 * The leading `shape_rank - brgemm_kernel_rank` dimensions keep their natural order
 * (0, 1, 2, ...). They are followed by three entries: second-to-last, last, second-to-last.
 *
 * @param shape_rank Rank of the planar shape; expected to be at least `brgemm_kernel_rank`
 *                   (smaller values would wrap around in the unsigned subtraction).
 * @return Order vector of size `shape_rank + 1`.
 */
VectorDims BrgemmExternalRepackingAdjuster::get_blk_order(size_t shape_rank) {
    const auto inner_axis = shape_rank - 1;
    const auto outer_axis = inner_axis - 1;

    // Batch dimensions are enumerated as-is.
    VectorDims blk_order(shape_rank - brgemm_kernel_rank);
    std::iota(blk_order.begin(), blk_order.end(), 0);

    // The second-to-last axis appears twice: once before and once after the last axis.
    blk_order.insert(blk_order.end(), {outer_axis, inner_axis, outer_axis});
    return blk_order;
}

/**
 * @brief Computes the blocked shape of a repacked input from its planar shape.
 *
 * The leading (batch) dimensions are copied unchanged. The last two dimensions
 * (K = second-to-last, N = last) are replaced by three entries:
 * `{div_up(K, vnni_factor), max(N, inner_n_block), vnni_factor}`.
 *
 * @param shape Planar shape; must contain at least `brgemm_kernel_rank` dimensions.
 * @param prc   Element type used to query the VNNI factor and the inner N block.
 * @return Blocked shape with one more dimension than `shape`.
 */
VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& shape, ov::element::Type prc) {
    const auto vnni = brgemm_utils::compute_vnni_factor(prc);

    const auto dim_N = *shape.rbegin();
    const auto dim_K = *(shape.rbegin() + 1);

    const auto blk_K = snippets::utils::div_up(dim_K, vnni);
    // N is never smaller than the inner N block reported for this precision.
    const auto blk_N = std::max(dim_N, brgemm_utils::repacking::compute_inner_n_block(prc));

    VectorDims result(shape.begin(), shape.end() - brgemm_kernel_rank);
    result.insert(result.end(), {blk_K, blk_N, vnni});
    return result;
}

/**
 * @brief Reconfigures the repacking executor for the given shape.
 *
 * Clones the executor's current config, updates the clone with the new
 * parameters, and passes the updated config to `update_by_config`.
 *
 * @param executor Repacking executor to update.
 * @param shape    Input shape used to compute the stride of dimension index 1.
 * @param layout   Input layout used together with `shape` for the stride computation.
 * @param N        Value of the N dimension; passed twice to `config->update`.
 * @param K        Value of the K dimension; passed twice to `config->update`.
 * @param prc      Element type; its `size()` scales the stride and it is used to compute LDB.
 */
void BrgemmExternalRepackingAdjuster::update_kernel(const RepackExecutorPtr& executor,
                                                    const VectorDims& shape,
                                                    const VectorDims& layout,
                                                    size_t N,
                                                    size_t K,
                                                    ov::element::Type prc) {
    // Stride of dimension index 1, scaled by the element size.
    const auto wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, 1) * prc.size();

    // The update is applied to a clone, not to the config object returned by the executor.
    const auto cloned_config = executor->get_config().get_clone_ptr();
    auto* kernel_config = static_cast<BrgemmCopyBKernelConfig*>(cloned_config.get());

    const auto ldb = brgemm_utils::repacking::compute_LDB(N, prc);
    kernel_config->update(N, N, K, K, wei_stride, ldb);
    executor->update_by_config(*kernel_config);
}

bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster")
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_configurator->get_config());
Expand All @@ -49,76 +81,66 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin
for (const auto& p : m_executors) {
const auto& i = p.first;
const auto& shape = cpu_config->io_shapes[i];
if (shape == cpu_config->latest_shapes[i])
continue;

const auto& layout = cpu_config->io_layouts[i];
const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout);
const auto& K = *++planar_shape.rbegin();
const auto& N = *planar_shape.rbegin();

// Create CPU Memory descriptor
const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0);
const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision);
const size_t brgemm_kernel_rank = 2;
// Firstly, batch dims are set
VectorDims requested_blocked_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank);
// Then, the blocked dims are formed
const auto new_K = snippets::utils::div_up(K, vnni_factor);
const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision));
requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, new_N, vnni_factor});

VectorDims requested_order(planar_shape.size() - brgemm_kernel_rank);
std::iota(requested_order.begin(), requested_order.end(), 0);
const auto last_idx = planar_shape.size() - 1;
requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1});

const auto desc = std::make_shared<CpuBlockedMemoryDesc>(precision,
Shape(planar_shape),
requested_blocked_shape,
requested_order);

// Create Kernel using BrgemmCopyBExecutor
const auto& executor = p.second;
const auto copy_wei_stride =
ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size();
const auto generic_config = executor->get_config().get_clone_ptr();
auto config = static_cast<BrgemmCopyBKernelConfig*>(generic_config.get());
config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision));
executor->update_by_config(*config);
const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0);
const auto blk_shape = get_blk_shape(planar_shape, prc);

// Save original input offsets for input before repacking.
const auto in_offsets = cpu_config->io_data_offsets[i];
// src data + dst data per kernel call
const auto src_data = N * K * prc.size();
const auto dst_data =
std::accumulate(blk_shape.rbegin(), blk_shape.rbegin() + 3, prc.size(), std::multiplies<size_t>());
data_size += src_data + dst_data;

ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1);
shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end());
m_configurator->compute_offsets(shape_for_offset, i, 0);
// Save new input offsets for input after repacking.
const auto out_offsets = cpu_config->io_data_offsets[i];
update_kernel(p.second, shape, layout, N, K, prc);
}

cpu_config->repacked_inputs[i] =
CPURuntimeConfig::RepackedInput(executor->get_kernel(), desc, in_offsets, out_offsets);
const auto L2_cache_size = dnnl::utils::get_cache_size(2, true);
const auto fit_into_L2 = data_size < L2_cache_size;
// Heuristic: If external repacking data doesn't fit in the L2 cache,
// external repacking should be executed in a separate parallel section before kernel execution.
cpu_config->repacking_impl_type =
fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL : CPURuntimeConfig::RepackingImplType::SEPARATE;

// src data + dst data per kernel call
data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size();
}
const auto is_impl_parallel = cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL;

for (const auto& p : m_executors) {
const auto& i = p.first;
const auto& shape = cpu_config->io_shapes[i];
auto& repacked_in = cpu_config->repacked_inputs[i];

const auto& prc = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0);
auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, cpu_config->io_layouts[i]);
auto blk_shape = get_blk_shape(planar_shape, prc);
// In parallel impl, each thread needs buffer with only shape [K_blk, N_blk, VNNI] to store repacking data
if (is_impl_parallel) {
std::fill(planar_shape.rbegin() + brgemm_kernel_rank, planar_shape.rend(), 1);
std::fill(blk_shape.rbegin() + brgemm_kernel_rank + 1, blk_shape.rend(), 1);
}
const auto order = get_blk_order(planar_shape.size());
const auto desc = std::make_shared<CpuBlockedMemoryDesc>(prc, Shape(planar_shape), blk_shape, order);

if (!cpu_config->repacked_inputs.empty()) {
const auto L2_cache_size = dnnl::utils::get_cache_size(2, true);
const auto fit_into_L2 = data_size < L2_cache_size;
// Heuristic: If external repacking data doesn't fit in the L2 cache,
// external repacking should be executed in a separate parallel section before kernel execution.
cpu_config->repacking_impl_type = fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL
: CPURuntimeConfig::RepackingImplType::SEPARATE;
// Save original input offsets for input before repacking.
const auto in_offsets =
shape == cpu_config->latest_shapes[i] ? repacked_in.in_offsets() : cpu_config->io_data_offsets[i];

// In parallel case Kernel should not add offsets to repacked inputs because
// they will be applied during repacking in execution stage
if (cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL) {
for (const auto& in : cpu_config->repacked_inputs) {
auto& offsets = cpu_config->io_data_offsets[in.first];
std::fill(offsets.begin(), offsets.end(), 0);
}
if (is_impl_parallel) {
auto& offsets = cpu_config->io_data_offsets[i];
std::fill(offsets.begin(), offsets.end(), 0);
} else {
ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1);
shape_for_offset.insert(shape_for_offset.end(), blk_shape.begin(), blk_shape.end());
m_configurator->compute_offsets(shape_for_offset, i, 0);
}
const auto out_offsets = cpu_config->io_data_offsets[i];

repacked_in = CPURuntimeConfig::RepackedInput(p.second->get_kernel(), desc, in_offsets, out_offsets);
}

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,19 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt
}

private:
std::unordered_map<size_t, std::shared_ptr<BrgemmCopyBKernelExecutor>> m_executors;
using RepackExecutorPtr = std::shared_ptr<BrgemmCopyBKernelExecutor>;
static VectorDims get_blk_order(size_t shape_rank);
static VectorDims get_blk_shape(const VectorDims& shape, ov::element::Type prc);

void update_kernel(const RepackExecutorPtr& executor,
const VectorDims& shape,
const VectorDims& layout,
size_t N,
size_t K,
ov::element::Type prc);

const static size_t brgemm_kernel_rank = 2;
std::unordered_map<size_t, RepackExecutorPtr> m_executors;
};

} // namespace intel_cpu
Expand Down

0 comments on commit fea453e

Please sign in to comment.