From cc9f457c643df56eda507251477acbd16e36134b Mon Sep 17 00:00:00 2001 From: chenhuwa Date: Tue, 17 Oct 2023 14:27:41 +0800 Subject: [PATCH] multi thread support --- .../include/snippets/lowered/linear_ir.hpp | 8 ++-- .../include/snippets/op/perf_count.hpp | 14 +++--- src/common/snippets/src/generator.cpp | 2 - .../src/lowered/pass/insert_perf_count.cpp | 1 + src/common/snippets/src/op/perf_count.cpp | 47 +++++++++++++++---- .../runtime/threading/thread_local.hpp | 14 ++++++ .../src/emitters/x64/jit_emitter.cpp | 15 ------ .../src/emitters/x64/jit_emitter.hpp | 16 ++++++- .../x64/jit_perf_count_chrono_emitters.cpp | 8 ++-- .../emitters/x64/jit_snippets_emitters.cpp | 8 ++-- 10 files changed, 87 insertions(+), 46 deletions(-) diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index a12281648e9067..5542f0e56a1dbd 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -16,10 +16,10 @@ namespace lowered { // Snippets performance count mode // Disabled - default, w/o perf count for snippets -// Chrono - perf count with chrono call. This is a universal method. +// Chrono - perf count with chrono call. This is a universal method, and supports the multi-thread case to output perf count data for each thread. // BackendSpecific - perf count provided by backend. This is for device specific requirment. -// For example, in sake of more light overhead, x86 CPU specific mode via read RDTSC register take ~50ns, -// while Chrono mode take 260ns for a pair of perf count start and perf count end execution, on ICX. +// For example, for the sake of lighter overhead and more accurate results, an x86 CPU specific mode via reading the RDTSC register is implemented, +// which takes ~50ns, while Chrono mode takes 260ns for a pair of perf count start and perf count end execution, on ICX. This mode only supports a single thread. 
enum PerfCountMode { Disabled, Chrono, @@ -33,7 +33,7 @@ class Config { // True if we should check runtime info for nodes to call specific needed transformations bool m_need_fill_tail_register = false; size_t m_loop_depth = 1; - PerfCountMode perf_count_mode = PerfCountMode::Chrono; + PerfCountMode perf_count_mode = PerfCountMode::Disabled; // Some Subgraphs doesn't support domain optimization due to operations' semantics bool m_enable_domain_optimization = false; // Minimal advised work amount for parallel execution. diff --git a/src/common/snippets/include/snippets/op/perf_count.hpp b/src/common/snippets/include/snippets/op/perf_count.hpp index 071cc8ed79a4c8..be7eccfc5b44aa 100644 --- a/src/common/snippets/include/snippets/op/perf_count.hpp +++ b/src/common/snippets/include/snippets/op/perf_count.hpp @@ -5,6 +5,7 @@ #pragma once #include "openvino/op/op.hpp" +#include "openvino/runtime/threading/thread_local.hpp" namespace ov { namespace snippets { @@ -59,7 +60,7 @@ class PerfCountBegin : public PerfCountBeginBase { std::chrono::high_resolution_clock::time_point& get_start_time(); private: - std::chrono::high_resolution_clock::time_point start_time_stamp = {}; + ov::threading::ThreadLocal start_time_stamp; }; /** @@ -73,17 +74,18 @@ class PerfCountEnd : public PerfCountEndBase { PerfCountEnd(const Output& pc_begin); PerfCountEnd() = default; ~PerfCountEnd() { - uint64_t avg = iteration == 0 ? 
0 : accumulation / iteration; - std::cout << "accumulation:" << accumulation << "ns, iteration:" << iteration << " avg:" << avg << "ns"<< std::endl; + output_perf_count(); } + void output_perf_count(); std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; - std::shared_ptr get_pc_begin(); + void init_pc_begin(); void set_accumulated_time(); private: - uint64_t accumulation = 0ul; - uint32_t iteration = 0u; + ov::threading::ThreadLocal accumulation; + ov::threading::ThreadLocal iteration; + std::shared_ptr m_pc_begin = nullptr; }; } // namespace op diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 4b51616faf3603..60e628f710d99a 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -50,8 +50,6 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons if (config.m_save_expressions || config.perf_count_mode != lowered::PerfCountMode::Disabled) lowered_saved = linear_ir; - lowered_saved = linear_ir; - return { target->get_snippet() }; } diff --git a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp index 9b39d143be202b..e61255ac5019d5 100644 --- a/src/common/snippets/src/lowered/pass/insert_perf_count.cpp +++ b/src/common/snippets/src/lowered/pass/insert_perf_count.cpp @@ -49,6 +49,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) { // insert perf_count_end before first result const auto& perf_count_end = std::make_shared(perf_count_begin->output(0)); + perf_count_end->set_friendly_name("last_parameter_to_first_result"); std::vector pc_end_inputs; pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0)); const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs); diff --git a/src/common/snippets/src/op/perf_count.cpp b/src/common/snippets/src/op/perf_count.cpp index d1908abef26842..f4779cfaaf47d6 100644 
--- a/src/common/snippets/src/op/perf_count.cpp +++ b/src/common/snippets/src/op/perf_count.cpp @@ -53,16 +53,17 @@ std::shared_ptr PerfCountBegin::clone_with_new_inputs(const OutputVector& } std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() { - return start_time_stamp; + return start_time_stamp.local(); } void PerfCountBegin::set_start_time() { - start_time_stamp = std::chrono::high_resolution_clock::now(); + start_time_stamp.local() = std::chrono::high_resolution_clock::now(); } //////////////////PerfCountEnd/////////////// PerfCountEnd::PerfCountEnd(const Output& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) { constructor_validate_and_infer_types(); + init_pc_begin(); } std::shared_ptr PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const { @@ -71,15 +72,43 @@ std::shared_ptr PerfCountEnd::clone_with_new_inputs(const OutputVector& in void PerfCountEnd::set_accumulated_time() { auto current_time = std::chrono::high_resolution_clock::now(); - auto& start_time = get_pc_begin()->get_start_time(); - accumulation += std::chrono::duration_cast(current_time - start_time).count(); - iteration++; + auto& start_time = m_pc_begin->get_start_time(); + accumulation.local() += std::chrono::duration_cast(current_time - start_time).count(); + iteration.local()++; } -std::shared_ptr PerfCountEnd::get_pc_begin() { - const auto& pc_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); - NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin"); - return pc_begin; +void PerfCountEnd::init_pc_begin() { + m_pc_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); + NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin"); +} + +void PerfCountEnd::output_perf_count() { + OPENVINO_ASSERT(accumulation.size() == 
iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node."); + auto iterator_iter = iteration.begin(); + auto iterator_acc = accumulation.begin(); + int t_num = 0; + std::vector avg_list; + std::string friendly_name = get_friendly_name(); + std::cout << "Perf count data in perfCountEnd node with name " << friendly_name << " is:"<< std::endl; + for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) { + const auto iter = *iterator_iter; + const auto acc = *iterator_acc; + uint64_t avg = iter == 0 ? 0 : acc / iter; + avg_list.push_back(avg); + std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl; + t_num++; + } + + // max time of all threads: combine for reduce max + auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) { + return a >= b ? a : b; + }; + // max accumulation + uint64_t acc_max = accumulation.combine(BinaryFunc); + std::cout << "max accumulated time:" << acc_max << "ns" << std::endl; + // max avg: default less-than comparator (BinaryFunc is a max-combiner, not a comparator, and would mislead std::max_element) + auto avg_max = std::max_element(avg_list.begin(), avg_list.end()); + std::cout << "max avg time:" << *avg_max << "ns" << std::endl; } } // namespace op diff --git a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp index 679d9518baa4ab..a92c312ee7273c 100644 --- a/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp +++ b/src/inference/dev_api/openvino/runtime/threading/thread_local.hpp @@ -116,6 +116,20 @@ struct ThreadLocal { auto end() const -> Iterator const { return {_map.end()}; } + + // CombineFunc has signature T(T,T) or T(const T&, const T&) + template + T combine(CombineFunc f_combine) { + if (begin() != end()) { + auto ci = begin(); + T my_result = *ci; + while (++ci != end()) + my_result = f_combine(my_result, *ci); + return my_result; + } else { + return _create(); + } + } }; #endif diff 
--git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp index 76368db8c2b5a2..92b669c533f7cd 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp @@ -271,20 +271,5 @@ void jit_emitter::internal_call_postamble() const { h->add(h->rsp, n_gprs_to_save * gpr_size); } -// additional 16 byte for offset, callee can use arbitrary regs. -void jit_emitter::internal_call_rsp_align() const { - h->mov(h->rbx, h->rsp); - h->and_(h->rbx, 0xf); - h->sub(h->rsp, h->rbx); - h->sub(h->rsp, 0x10); - h->mov(h->ptr[h->rsp], h->rbx); -} - -void jit_emitter::internal_call_rsp_restore() const { - h->mov(h->rbx, h->ptr[h->rsp]); - h->add(h->rsp, 0x10); - h->add(h->rsp, h->rbx); -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp index 2a645a363d2249..b3c198afdf34cf 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp @@ -134,8 +134,20 @@ class jit_emitter : public ov::snippets::Emitter { virtual void internal_call_preamble() const; virtual void internal_call_postamble() const; - virtual void internal_call_rsp_align() const; - virtual void internal_call_rsp_restore() const; + // Must be inline functions to avoid corrupting rsp by a function call. + // Additional 16 bytes for offset, callee can use arbitrary regs. 
+ inline void internal_call_rsp_align() const { + h->mov(h->rbx, h->rsp); + h->and_(h->rbx, 0xf); + h->sub(h->rsp, h->rbx); + h->sub(h->rsp, 0x10); + h->mov(h->ptr[h->rsp], h->rbx); + } + inline void internal_call_rsp_restore() const { + h->mov(h->rbx, h->ptr[h->rsp]); + h->add(h->rsp, 0x10); + h->add(h->rsp, h->rbx); + } private: mutable std::vector preserved_vec_idxs; diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp index a94535dfbcbd55..3351a4a02fcd12 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_chrono_emitters.cpp @@ -35,9 +35,9 @@ void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector &i const auto &set_start_time_overload = static_cast(set_start_time); h->mov(h->rax, reinterpret_cast(set_start_time_overload)); h->mov(abi_param1, reinterpret_cast(m_start_node.get())); - internal_call_rsp_align(); + jit_emitter::internal_call_rsp_align(); h->call(h->rax); - internal_call_rsp_restore(); + jit_emitter::internal_call_rsp_restore(); internal_call_postamble(); } @@ -62,9 +62,9 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector &in_ const auto &set_accumulated_time_overload = static_cast(set_accumulated_time); h->mov(h->rax, reinterpret_cast(set_accumulated_time_overload)); h->mov(abi_param1, reinterpret_cast(m_end_node.get())); - internal_call_rsp_align(); + jit_emitter::internal_call_rsp_align(); h->call(h->rax); - internal_call_rsp_restore(); + jit_emitter::internal_call_rsp_restore(); internal_call_postamble(); } diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index b6c21a93cd00c6..65b48cccd3e7c8 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ 
b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -1115,9 +1115,9 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c h->mov(abi_param6, static_cast(m_with_comp)); #endif - internal_call_rsp_align(); + jit_emitter::internal_call_rsp_align(); h->call(h->rbp); - internal_call_rsp_restore(); + jit_emitter::internal_call_rsp_restore(); #ifdef _WIN32 h->add(h->rsp, num_args_passed_on_stack * gpr_size); @@ -1321,9 +1321,9 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b h->mov(abi_param6, K); #endif - internal_call_rsp_align(); + jit_emitter::internal_call_rsp_align(); h->call(h->rbp); - internal_call_rsp_restore(); + jit_emitter::internal_call_rsp_restore(); #ifdef _WIN32 h->add(h->rsp, gpr_size * num_args_passed_on_stack);