Skip to content

Commit

Permalink
support multithreaded case with thread-local handler
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Oct 11, 2023
1 parent abc6c81 commit 8b94e42
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 13 deletions.
97 changes: 88 additions & 9 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
#include <vector>
#include "utils/general_utils.h"

using namespace dnnl::impl::cpu;
using namespace dnnl::impl;
using namespace dnnl::impl::cpu;
using namespace dnnl::impl::cpu::x64;
using namespace Xbyak;

namespace ov {
namespace intel_cpu {

jit_emitter* g_debug_err_handler = nullptr;
std::shared_ptr<ThreadLocal<jit_emitter*>> g_debug_err_handler = std::make_shared<ThreadLocal<jit_emitter*>>();

size_t jit_emitter::get_max_vecs_count() const {
return one_of(host_isa_, cpu::x64::avx512_core, cpu::x64::avx512_core) ? 32 : 16;
Expand Down Expand Up @@ -217,16 +218,94 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}

void jit_emitter::internal_call_preamble() const {
// gprs
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

h->sub(h->rsp, n_gprs_to_save * gpr_size_);
for (size_t i = 0; i < n_gprs_to_save; ++i)
h->mov(h->ptr[h->rsp + i * gpr_size_], gprs_to_save[i]);

// mask regs
// need preserve based on cpu capability, instead of host isa.
// in case there are possibilty that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
// e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
// do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
h->sub(h->rsp, k_mask_num * k_mask_size);
for (size_t i = 0; i < k_mask_num; ++i) {
h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
}
}

// vector regs
// 1. Caller obligation to save vector registers as callee may use them.
// 2. There is an implicit assumption that the host code uses the same
// `isa` as the injector. Once the assumption is wrong, `vecs_count` and
// `vlen` should be replaced with `host_isa::vlen` and
// `host_isa::vecs_count`.
h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
for (size_t i = 0; i < get_max_vecs_count(); ++i) {
push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
}
}

void jit_emitter::internal_call_postamble() const {
// restore vector registers
for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
}
h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

// restore k reg
if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
for (int i = k_mask_num - 1; i >= 0; --i) {
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
}
h->add(h->rsp, k_mask_num * k_mask_size);
}

// restore gpr registers
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
for (int i = n_gprs_to_save - 1; i >= 0; --i)
h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size_]);
h->add(h->rsp, n_gprs_to_save * gpr_size_);
}

// Emits code that rounds rsp down to a 16-byte boundary before an internal call.
// An additional 16 bytes are reserved below the aligned address; the first 8 of them hold
// the alignment adjustment (rsp & 0xf) so that internal_call_rsp_restore() can undo it.
// rbx is used as scratch and is overwritten by the emitted code; the callee can use
// arbitrary regs.
void jit_emitter::internal_call_rsp_align() const {
// rbx = rsp & 0xf: number of bytes rsp is above the previous 16-byte boundary
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
// drop rsp to that boundary, then reserve the extra 16-byte slot (keeps the alignment)
h->sub(h->rsp, h->rbx);
h->sub(h->rsp, 0x10);
// remember the adjustment at the new top of stack
h->mov(h->ptr[h->rsp], h->rbx);
}

// Emits code that undoes internal_call_rsp_align(): reads the adjustment stored at [rsp],
// releases the 16-byte slot and adds the adjustment back to rsp.
// rbx is overwritten by the emitted code.
void jit_emitter::internal_call_rsp_restore() const {
// rbx = adjustment saved by internal_call_rsp_align()
h->mov(h->rbx, h->ptr[h->rsp]);
// release the 16-byte slot, then re-apply the original misalignment
h->add(h->rsp, 0x10);
h->add(h->rsp, h->rbx);
}

// Emits code that registers this emitter as the debug/error handler, so that a handler
// invoked later can report which emitter was active.
//
// NOTE(review): this listing looks like a diff view with removed and added lines
// interleaved. The r15/r14 push, the raw store through r15 into g_debug_err_handler and the
// two pops after the closing brace appear to be the previous implementation; the
// preamble / call to set_local_handler / postamble sequence appears to be its replacement.
// Confirm against the actual file before relying on the exact statement list.
void jit_emitter::build_debug_info() const {
// presumably pre-change: scratch registers saved by hand
h->push(h->r15);
h->push(h->r14);
// spill GPRs, opmask registers and vector registers around the native call
internal_call_preamble();

// presumably pre-change: write `this` directly into the storage of g_debug_err_handler
h->mov(h->r15, reinterpret_cast<uint64_t>(&g_debug_err_handler));
h->mov(h->r14, reinterpret_cast<uint64_t>(this));
h->mov(h->qword[h->r15], h->r14);
// the cast to a plain function-pointer type yields the address to load into rax
const auto &set_local_handler_overload = static_cast<void (*)(jit_emitter*)>(set_local_handler);
h->mov(h->rax, reinterpret_cast<size_t>(set_local_handler_overload));
// abi_param1 receives `this` (presumably the first-argument register of the native ABI)
h->mov(abi_param1, reinterpret_cast<uint64_t>(this));
// align rsp to 16 bytes for the call, then undo the alignment afterwards
internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();

internal_call_postamble();
}

// NOTE(review): these pops sit outside the function body in this listing; they look like
// the counterpart of the r15/r14 pushes above from the pre-change version — confirm.
h->pop(h->r14);
h->pop(h->r15);
// Stores `emitter_address` in the calling thread's slot of g_debug_err_handler.
// The JIT code produced by build_debug_info() loads the address of this function and calls it.
void jit_emitter::set_local_handler(jit_emitter* emitter_address) {
    auto& thread_slot = g_debug_err_handler->local();
    thread_slot = emitter_address;
}

} // namespace intel_cpu
Expand Down
15 changes: 13 additions & 2 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,24 @@
//

#pragma once

#include <ie_common.h>
#include <cpu/x64/jit_generator.hpp>

#include "snippets/snippets_isa.hpp"
#include "snippets/generator.hpp"
#include <node.h>

#include "openvino/runtime/threading/thread_local.hpp"

#include <set>

using namespace ov::threading;

namespace ov {
namespace intel_cpu {

class jit_emitter;
extern jit_emitter* g_debug_err_handler;
extern std::shared_ptr<ThreadLocal<jit_emitter*>> g_debug_err_handler;

enum emitter_in_out_map {
vec_to_vec,
Expand Down Expand Up @@ -113,6 +116,8 @@ class jit_emitter : public ov::snippets::Emitter {
mutable std::vector<size_t> aux_gpr_idxs;

// Bytes reserved on the stack per opmask register when spilling around an internal call.
static constexpr int k_mask_size = 8;
// Number of opmask registers spilled/restored around an internal call (indices 0..k_mask_num-1).
static constexpr int k_mask_num = 8;
// Bytes reserved on the stack per general-purpose register when spilling around an internal call.
static constexpr int gpr_size_ = 8;

Xbyak::Address table_val(std::string key, size_t key_off_val_shift = 0) const {
auto off = table_off(key, key_off_val_shift);
Expand All @@ -137,7 +142,13 @@ class jit_emitter : public ov::snippets::Emitter {
}
}

// Emits code that spills general-purpose, opmask (when avx512_core is available) and vector
// registers to the stack before a native call made from generated code.
virtual void internal_call_preamble() const;
// Emits code that reloads everything spilled by internal_call_preamble() and releases the stack.
virtual void internal_call_postamble() const;
// Emits code that aligns rsp to 16 bytes and stores the adjustment on the stack (uses rbx).
virtual void internal_call_rsp_align() const;
// Emits code that reverts the adjustment made by internal_call_rsp_align() (uses rbx).
virtual void internal_call_rsp_restore() const;

// Emits code that registers this emitter as the debug/error handler.
void build_debug_info() const;
// Stores `emitter_address` in the calling thread's slot of g_debug_err_handler.
static void set_local_handler(jit_emitter* emitter_address);

private:
mutable std::vector<size_t> preserved_vec_idxs;
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector<MemoryPtr>& inMe
update_ptrs(call_args, inMemPtrs, outMemPtrs);
#ifdef __linux__
__sighandler_t signal_handler = [](int signal) {
ov::intel_cpu::g_debug_err_handler->print_debug_info();
ov::intel_cpu::g_debug_err_handler->local()->print_debug_info();
OPENVINO_THROW("Segfault was caught by the signal handler");
};
struct sigaction new_handler{};
Expand Down Expand Up @@ -502,7 +502,7 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector<MemoryPtr>& inMe
}
#ifdef __linux__
__sighandler_t signal_handler = [](int signal) {
ov::intel_cpu::g_debug_err_handler->print_debug_info();
ov::intel_cpu::g_debug_err_handler->local()->print_debug_info();
OPENVINO_THROW("Segfault was caught by the signal handler");
};
struct sigaction new_handler{};
Expand Down

0 comments on commit 8b94e42

Please sign in to comment.