Skip to content

Commit

Permalink
multi-thread support, inline internal calls to avoid rsp corruption
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Nov 8, 2023
1 parent 92f4238 commit aa0cec4
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 96 deletions.
8 changes: 4 additions & 4 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ namespace lowered {

// Snippets performance count mode
// Disabled - default, w/o perf count for snippets
// Chrono - perf count with chrono call. This is a universal method.
// Chrono - perf count with chrono call. This is a universal method, and supports the multi-thread case, outputting perf count data for each thread.
// BackendSpecific - perf count provided by backend. This is for device-specific requirements.
// For example, in sake of more light overhead, x86 CPU specific mode via read RDTSC register take ~50ns,
// while Chrono mode take 260ns for a pair of perf count start and perf count end execution, on ICX.
// For example, for the sake of lighter overhead and a more accurate result, an x86 CPU specific mode that reads the RDTSC register is implemented,
// which takes ~50ns, while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX. This mode only supports a single thread.
enum PerfCountMode {
Disabled,
Chrono,
Expand All @@ -33,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs doesn't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
Expand Down
14 changes: 8 additions & 6 deletions src/common/snippets/include/snippets/op/perf_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
Expand Down Expand Up @@ -59,7 +60,7 @@ class PerfCountBegin : public PerfCountBeginBase {
std::chrono::high_resolution_clock::time_point& get_start_time();

private:
std::chrono::high_resolution_clock::time_point start_time_stamp = {};
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
Expand All @@ -73,17 +74,18 @@ class PerfCountEnd : public PerfCountEndBase {
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
// Dump the collected per-thread perf statistics when the node is destroyed.
// Stale pre-ThreadLocal statistics lines removed: the printing now lives in
// output_perf_count(), which iterates the per-thread ThreadLocal values.
~PerfCountEnd() {
    output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

std::shared_ptr<PerfCountBegin> get_pc_begin();
void init_pc_begin();
void set_accumulated_time();

private:
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};

} // namespace op
Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/src/lowered/pass/insert_perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) {

// insert perf_count_end before first result
const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name("last_parameter_to_first_result");
std::vector<PortConnectorPtr> pc_end_inputs;
pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
Expand Down
47 changes: 38 additions & 9 deletions src/common/snippets/src/op/perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,17 @@ std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector&
}

// Return a mutable reference to the calling thread's start-timestamp slot.
// ThreadLocal::local() yields a distinct entry per thread, so concurrent
// kernels do not overwrite each other's timestamps.
std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
    return start_time_stamp.local();
}

// Record "now" as the current thread's measurement start point; the paired
// PerfCountEnd reads it back in set_accumulated_time().
void PerfCountBegin::set_start_time() {
    start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
// Ctor: validates the node, then resolves and caches the paired
// PerfCountBegin once (init_pc_begin) so the per-iteration hot path in
// set_accumulated_time() does not have to re-resolve it on every call.
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
    constructor_validate_and_infer_types();
    init_pc_begin();
}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
Expand All @@ -71,15 +72,43 @@ std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& in

// Accumulate the elapsed time since the paired PerfCountBegin's start stamp
// into the calling thread's counters. Uses the cached m_pc_begin (set by
// init_pc_begin) rather than re-resolving the input node on every call.
// Stale pre-refactor statements (duplicate start_time declaration, non-.local()
// accumulation) removed — they redeclared the same local and no longer compile
// against the ThreadLocal members.
void PerfCountEnd::set_accumulated_time() {
    auto current_time = std::chrono::high_resolution_clock::now();
    auto& start_time = m_pc_begin->get_start_time();
    accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
    iteration.local()++;
}

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
return pc_begin;
void PerfCountEnd::init_pc_begin() {
m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}

void PerfCountEnd::output_perf_count() {
OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
std::vector<uint64_t> avg_list;
std::string friendly_name = get_friendly_name();
std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
avg_list.push_back(avg);
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}

// max time of all threads: combine for reduce max
auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
return a >= b ? a : b;
};
// max accumulation
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg
auto avg_max = std::max_element(avg_list.begin(), avg_list.end(), BinaryFunc);
std::cout << "max avg time:" << *avg_max << "ns" << std::endl;
}

} // namespace op
Expand Down
14 changes: 14 additions & 0 deletions src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,20 @@ struct ThreadLocal {
// Const past-the-end iterator over the per-thread values (wraps _map.end()).
auto end() const -> Iterator<decltype(_map.end())> const {
    return {_map.end()};
}

// CombineFunc has signature T(T,T) or T(const T&, const T&)
// Left-fold of all per-thread values with f_combine; an empty map yields a
// freshly created default value from _create().
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
    auto it = begin();
    if (it == end())
        return _create();
    T folded = *it;
    for (++it; it != end(); ++it)
        folded = f_combine(folded, *it);
    return folded;
}
};

#endif
Expand Down
73 changes: 0 additions & 73 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,78 +213,5 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}

// Emits code that spills caller-visible register state (GPRs, k-mask regs on
// AVX512-capable CPUs, vector regs) to the stack before an internal host-code
// call. Must be paired with internal_call_postamble(), which restores in
// reverse order with matching sizes.
void jit_emitter::internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // need to preserve based on cpu capability, instead of host isa.
    // in case there is the possibility that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
    // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
    // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
    if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    // `vlen` should be replaced with `host_isa::vlen` and
    // `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}

// Emits code that restores the register state saved by
// internal_call_preamble(), in reverse order: vector regs first, then k-mask
// regs (AVX512 targets only), then GPRs. Stack offsets/sizes must mirror the
// preamble exactly.
void jit_emitter::internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k reg
    if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}

// additional 16 byte for offset, callee can use arbitrary regs.
// Emits code that aligns rsp down to a 16-byte boundary and stores the applied
// adjustment (rsp & 0xf) on the stack so internal_call_rsp_restore() can undo
// it; the extra 16 bytes keep the slot clear of callee clobbers.
void jit_emitter::internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
    h->sub(h->rsp, 0x10);
    h->mov(h->ptr[h->rsp], h->rbx);
}

// Emits code that reads back the adjustment stored by
// internal_call_rsp_align(), pops the 16-byte slot, and restores rsp to its
// pre-alignment value.
void jit_emitter::internal_call_rsp_restore() const {
    h->mov(h->rbx, h->ptr[h->rsp]);
    h->add(h->rsp, 0x10);
    h->add(h->rsp, h->rbx);
}

} // namespace intel_cpu
} // namespace ov
71 changes: 67 additions & 4 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,73 @@ class jit_emitter : public ov::snippets::Emitter {
}
}

virtual void internal_call_preamble() const;
virtual void internal_call_postamble() const;
virtual void internal_call_rsp_align() const;
virtual void internal_call_rsp_restore() const;
// below 4 functions must be inline functions to avoid corrupted rsp by function call, so defined inside class declaration.
// Emits code that spills caller-visible register state (GPRs, k-mask regs on
// AVX512-capable CPUs, vector regs) to the stack before an internal host-code
// call. Must be paired with internal_call_postamble(), which restores in
// reverse order with matching sizes.
void internal_call_preamble() const {
    // gprs
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

    h->sub(h->rsp, n_gprs_to_save * gpr_size);
    for (size_t i = 0; i < n_gprs_to_save; ++i)
        h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

    // mask regs
    // need to preserve based on cpu capability, instead of host isa.
    // in case there is the possibility that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
    // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
    // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        h->sub(h->rsp, k_mask_num * k_mask_size);
        for (size_t i = 0; i < k_mask_num; ++i) {
            h->kmovq(h->ptr[h->rsp + i * k_mask_size], Xbyak::Opmask(static_cast<int>(i)));
        }
    }

    // vector regs
    // 1. Caller obligation to save vector registers as callee may use them.
    // 2. There is an implicit assumption that the host code uses the same
    // `isa` as the injector. Once the assumption is wrong, `vecs_count` and
    // `vlen` should be replaced with `host_isa::vlen` and
    // `host_isa::vecs_count`.
    h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
    for (size_t i = 0; i < get_max_vecs_count(); ++i) {
        push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
    }
}
// Emits code that restores the register state saved by
// internal_call_preamble(), in reverse order: vector regs first, then k-mask
// regs (AVX512 targets only), then GPRs. Stack offsets/sizes must mirror the
// preamble exactly.
void internal_call_postamble() const {
    // restore vector registers
    for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
        pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
    }
    h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

    // restore k reg
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
        for (int i = k_mask_num - 1; i >= 0; --i) {
            h->kmovq(Xbyak::Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
        }
        h->add(h->rsp, k_mask_num * k_mask_size);
    }

    // restore gpr registers
    Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
                                     h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
    size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
    for (int i = n_gprs_to_save - 1; i >= 0; --i)
        h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
    h->add(h->rsp, n_gprs_to_save * gpr_size);
}
// align stack on 16-byte as ABI requires
// callee is responsible to save and restore rbx. rbx must not be changed after calling the callee.
// NOTE: unlike the older variant, the alignment delta is kept in rbx rather
// than spilled to the stack; this relies on rbx being callee-saved (true in
// both SysV and Windows x64 ABIs) so internal_call_rsp_restore() can add it
// back without any stack bookkeeping.
void internal_call_rsp_align() const {
    h->mov(h->rbx, h->rsp);
    h->and_(h->rbx, 0xf);
    h->sub(h->rsp, h->rbx);
}
// Undo internal_call_rsp_align(): rbx must still hold the alignment delta
// (the callee must not have clobbered it).
void internal_call_rsp_restore() const {
    h->add(h->rsp, h->rbx);
}

private:
mutable std::vector<size_t> preserved_vec_idxs;
Expand Down

0 comments on commit aa0cec4

Please sign in to comment.