Skip to content

Commit

Permalink
multi-thread support, inline internal calls to avoid rsp corruption
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Nov 8, 2023
1 parent 92f4238 commit aa0cec4
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 96 deletions.
8 changes: 4 additions & 4 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ namespace lowered {

// Snippets performance count mode
// Disabled - default, w/o perf count for snippets
// Chrono - perf count with chrono call. This is a universal method.
// Chrono - perf count with chrono call. This is a universal method, and supports the multi-thread case, outputting perf count data for each thread.
// BackendSpecific - perf count provided by backend. This is for device-specific requirements.
// For example, in sake of more light overhead, x86 CPU specific mode via read RDTSC register take ~50ns,
// while Chrono mode take 260ns for a pair of perf count start and perf count end execution, on ICX.
// For example, for the sake of lighter overhead and a more accurate result, an x86 CPU specific mode that reads the RDTSC register is implemented,
// which takes ~50ns, while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX. This mode only supports a single thread.
enum PerfCountMode {
Disabled,
Chrono,
Expand All @@ -33,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs doesn't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
Expand Down
14 changes: 8 additions & 6 deletions src/common/snippets/include/snippets/op/perf_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
Expand Down Expand Up @@ -59,7 +60,7 @@ class PerfCountBegin : public PerfCountBeginBase {
std::chrono::high_resolution_clock::time_point& get_start_time();

private:
std::chrono::high_resolution_clock::time_point start_time_stamp = {};
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
Expand All @@ -73,17 +74,18 @@ class PerfCountEnd : public PerfCountEndBase {
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
// Dump the collected per-thread perf statistics when the node is destroyed.
// Stale pre-ThreadLocal statistics lines removed: the printing now lives in
// output_perf_count(), which iterates the per-thread ThreadLocal values.
~PerfCountEnd() {
    output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

std::shared_ptr<PerfCountBegin> get_pc_begin();
void init_pc_begin();
void set_accumulated_time();

private:
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};

} // namespace op
Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/src/lowered/pass/insert_perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) {

// insert perf_count_end before first result
const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name("last_parameter_to_first_result");
std::vector<PortConnectorPtr> pc_end_inputs;
pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
Expand Down
47 changes: 38 additions & 9 deletions src/common/snippets/src/op/perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,17 @@ std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector&
}

// Return a mutable reference to the calling thread's start-timestamp slot.
// ThreadLocal::local() yields a distinct entry per thread, so concurrent
// kernels do not overwrite each other's timestamps.
std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
    return start_time_stamp.local();
}

// Record "now" as the current thread's measurement start point; the paired
// PerfCountEnd reads it back in set_accumulated_time().
void PerfCountBegin::set_start_time() {
    start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
// Ctor: validates the node, then resolves and caches the paired
// PerfCountBegin once (init_pc_begin) so the per-iteration hot path in
// set_accumulated_time() does not have to re-resolve it on every call.
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
    init_pc_begin();
}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
Expand All @@ -71,15 +72,43 @@ std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& in

// Accumulate the elapsed time since the paired PerfCountBegin's start stamp
// into the calling thread's counters. Uses the cached m_pc_begin (set by
// init_pc_begin) rather than re-resolving the input node on every call.
// Stale pre-refactor statements (duplicate start_time declaration, non-.local()
// accumulation) removed — they redeclared the same local and no longer compile
// against the ThreadLocal members.
void PerfCountEnd::set_accumulated_time() {
    auto current_time = std::chrono::high_resolution_clock::now();
    auto& start_time = m_pc_begin->get_start_time();
    accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
    iteration.local()++;
}

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
return pc_begin;
void PerfCountEnd::init_pc_begin() {
m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}

void PerfCountEnd::output_perf_count() {
OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
std::vector<uint64_t> avg_list;
std::string friendly_name = get_friendly_name();
std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
avg_list.push_back(avg);
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}

// max time of all threads: combine for reduce max
auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
return a >= b ? a : b;
};
// max accumulation
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
auto avg_max = std::max_element(avg_list.begin(), avg_list.end(), BinaryFunc);
std::cout << "max avg time:" << *avg_max << "ns" << std::endl;
}

} // namespace op
Expand Down
14 changes: 14 additions & 0 deletions src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ struct ThreadLocal {
// Const past-the-end iterator over the per-thread values (wraps _map.end()).
auto end() const -> Iterator<decltype(_map.end())> const {
    return {_map.end()};
}

// CombineFunc has signature T(T,T) or T(const T&, const T&)
// Left-fold of all per-thread values with f_combine; an empty map yields a
// freshly created default value from _create().
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
    auto it = begin();
    if (it == end())
        return _create();
    T folded = *it;
    for (++it; it != end(); ++it)
        folded = f_combine(folded, *it);
    return folded;
}
};

#endif
Expand Down
73 changes: 0 additions & 73 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,78 +213,5 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}

// Emits code that spills caller-visible register state (GPRs, k-mask regs on
// AVX512-capable CPUs, vector regs) to the stack before an internal host-code
// call. Must be paired with internal_call_postamble(), which restores in
// reverse order with matching sizes.
void jit_emitter::internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // need to preserve based on cpu capability, instead of host isa.
    // in case there is the possibility that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
    // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
    // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
    if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    // `vlen` should be replaced with `host_isa::vlen` and
    // `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}

// Emits code that restores the register state saved by
// internal_call_preamble(), in reverse order: vector regs first, then k-mask
// regs (AVX512 targets only), then GPRs. Stack offsets/sizes must mirror the
// preamble exactly.
void jit_emitter::internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k reg
    if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}

// additional 16 byte for offset, callee can use arbitrary regs.
// Emits code that aligns rsp down to a 16-byte boundary and stores the applied
// adjustment (rsp & 0xf) on the stack so internal_call_rsp_restore() can undo
// it; the extra 16 bytes keep the slot clear of callee clobbers.
void jit_emitter::internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
    h->sub(h->rsp, 0x10);
    h->mov(h->ptr[h->rsp], h->rbx);
}

// Emits code that reads back the adjustment stored by
// internal_call_rsp_align(), pops the 16-byte slot, and restores rsp to its
// pre-alignment value.
void jit_emitter::internal_call_rsp_restore() const {
    h->mov(h->rbx, h->ptr[h->rsp]);
    h->add(h->rsp, 0x10);
    h->add(h->rsp, h->rbx);
}

} // namespace intel_cpu
} // namespace ov
71 changes: 67 additions & 4 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,73 @@ class jit_emitter : public ov::snippets::Emitter {
}
}

virtual void internal_call_preamble() const;
virtual void internal_call_postamble() const;
virtual void internal_call_rsp_align() const;
virtual void internal_call_rsp_restore() const;
// below 4 functions must be inline functions to avoid corrupted rsp by function call, so defined inside class declaration.
// Emits code that spills caller-visible register state (GPRs, k-mask regs on
// AVX512-capable CPUs, vector regs) to the stack before an internal host-code
// call. Must be paired with internal_call_postamble(), which restores in
// reverse order with matching sizes.
void internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // need to preserve based on cpu capability, instead of host isa.
    // in case there is the possibility that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
    // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
    // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    // `vlen` should be replaced with `host_isa::vlen` and
    // `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}
// Emits code that restores the register state saved by
// internal_call_preamble(), in reverse order: vector regs first, then k-mask
// regs (AVX512 targets only), then GPRs. Stack offsets/sizes must mirror the
// preamble exactly.
void internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k reg
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}
// align stack on 16-byte as ABI requires
// callee is responsible to save and restore rbx. rbx must not be changed after calling the callee.
// NOTE: unlike the older variant, the alignment delta is kept in rbx rather
// than spilled to the stack; this relies on rbx being callee-saved (true in
// both SysV and Windows x64 ABIs) so internal_call_rsp_restore() can add it
// back without any stack bookkeeping.
void internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
}
// Undo internal_call_rsp_align(): rbx must still hold the alignment delta
// (the callee must not have clobbered it).
void internal_call_rsp_restore() const {
    h->add(h->rsp, h->rbx);
}

private:
mutable std::vector<size_t> preserved_vec_idxs;
Expand Down

0 comments on commit aa0cec4

Please sign in to comment.