Skip to content

Commit

Permalink
multi thread support
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Oct 17, 2023
1 parent 7ed6cc2 commit e0a9f68
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 14 deletions.
6 changes: 3 additions & 3 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ namespace lowered {
// Disabled - default, w/o perf count for snippets
// Chrono - perf count with chrono call. This is a universal method.
// BackendSpecific - perf count provided by backend. This is for device-specific requirements.
// For example, for the sake of lighter overhead, the x86 CPU specific mode, which reads the RDTSC register, takes ~50ns,
// while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX.
// For example, for the sake of lighter overhead and more accurate results, an x86 CPU specific mode that reads the RDTSC register is implemented,
// which takes ~50ns, while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX.
enum PerfCountMode {
Disabled,
Chrono,
Expand All @@ -33,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs don't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
Expand Down
11 changes: 6 additions & 5 deletions src/common/snippets/include/snippets/op/perf_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
Expand Down Expand Up @@ -59,7 +60,7 @@ class PerfCountBegin : public PerfCountBeginBase {
std::chrono::high_resolution_clock::time_point& get_start_time();

private:
std::chrono::high_resolution_clock::time_point start_time_stamp = {};
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
Expand All @@ -73,17 +74,17 @@ class PerfCountEnd : public PerfCountEndBase {
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
~PerfCountEnd() {
uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
std::cout << "accumulation:" << accumulation << "ns, iteration:" << iteration << " avg:" << avg << "ns"<< std::endl;
output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

std::shared_ptr<PerfCountBegin> get_pc_begin();
void set_accumulated_time();

private:
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
};

} // namespace op
Expand Down
2 changes: 0 additions & 2 deletions src/common/snippets/src/generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons
if (config.m_save_expressions || config.perf_count_mode != lowered::PerfCountMode::Disabled)
lowered_saved = linear_ir;

lowered_saved = linear_ir;

return { target->get_snippet() };
}

Expand Down
35 changes: 31 additions & 4 deletions src/common/snippets/src/op/perf_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector&
}

std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
return start_time_stamp;
return start_time_stamp.local();
}

void PerfCountBegin::set_start_time() {
start_time_stamp = std::chrono::high_resolution_clock::now();
start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
Expand All @@ -72,8 +72,8 @@ std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& in
void PerfCountEnd::set_accumulated_time() {
auto current_time = std::chrono::high_resolution_clock::now();
auto& start_time = get_pc_begin()->get_start_time();
accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration++;
accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration.local()++;
}

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
Expand All @@ -82,6 +82,33 @@ std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
return pc_begin;
}

// Prints the collected perf counters to std::cout.
//
// For every thread-local slot one line is written with the accumulated time, the iteration
// count and the average time per iteration (0 when the slot recorded no iterations). After
// that, the largest accumulated time and the largest per-thread average are printed.
//
// NOTE(review): this is invoked from ~PerfCountEnd(); if OPENVINO_ASSERT reports failure by
// throwing, the exception would leave a destructor - confirm this is acceptable.
void PerfCountEnd::output_perf_count() {
    OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
    auto iterator_iter = iteration.begin();
    auto iterator_acc = accumulation.begin();
    int t_num = 0;
    std::vector<uint64_t> avg_list;
    // Both containers were just checked to have the same size, so they are walked in lock step.
    for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
        const auto iter = *iterator_iter;
        const auto acc = *iterator_acc;
        // Guard against division by zero for a slot that never recorded an iteration.
        const uint64_t avg = iter == 0 ? 0 : acc / iter;
        avg_list.push_back(avg);
        std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
        t_num++;
    }

    // max time of all threads: combine() folds the per-thread values with a "pick the larger" reducer
    auto pick_larger = [](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;
    };
    // max accumulation
    const uint64_t acc_max = accumulation.combine(pick_larger);
    std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
    // max avg: std::max_element needs a strict "less-than" ordering, so the default operator< is
    // used here. The reducer above returns a value, not an ordering, and must not be passed as
    // the comparator. An empty list (no thread-local slots) reports 0 instead of dereferencing end().
    const uint64_t avg_max = avg_list.empty() ? 0 : *std::max_element(avg_list.begin(), avg_list.end());
    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}

} // namespace op
} // namespace snippets
} // namespace ov
13 changes: 13 additions & 0 deletions src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,19 @@ struct ThreadLocal {
auto end() const -> Iterator<decltype(_map.end())> const {
return {_map.end()};
}

// Folds every stored per-thread value into a single result.
// CombineFunc is a binary functor with signature T(T,T) or T(const T&, const T&).
// If there is no stored value at all, the result of _create() is returned instead.
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
    auto cursor = begin();
    if (cursor == end()) {
        return _create();
    }
    // Seed the fold with the first value, then merge the remaining ones in iteration order.
    T folded = *cursor;
    for (++cursor; cursor != end(); ++cursor) {
        folded = f_combine(folded, *cursor);
    }
    return folded;
}
};

#endif
Expand Down

0 comments on commit e0a9f68

Please sign in to comment.