multi thread support
chenhu-wang committed Oct 23, 2023
1 parent 0329882 commit 4dacbcc
Showing 11 changed files with 142 additions and 46 deletions.
8 changes: 4 additions & 4 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -16,10 +16,10 @@ namespace lowered {

// Snippets performance count mode
// Disabled - default, w/o perf count for snippets
// Chrono - perf count with chrono call. This is a universal method.
// Chrono - perf count with chrono call. This is a universal method, and it supports the multi-thread case, outputting perf count data for each thread.
// BackendSpecific - perf count provided by backend. This is for device-specific requirements.
// For example, for the sake of lower overhead, the x86 CPU specific mode that reads the RDTSC register takes ~50ns,
// while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX.
// For example, for the sake of lower overhead and more accurate results, an x86 CPU specific mode that reads the RDTSC register is implemented,
// which takes ~50ns, while Chrono mode takes 260ns for a pair of perf count start and perf count end executions, on ICX. This mode supports only a single thread.
enum PerfCountMode {
Disabled,
Chrono,
@@ -33,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
// Some Subgraphs doesn't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
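With Disabled now the default, perf counting has to be opted into explicitly. A minimal sketch of toggling the mode on the lowering config, assuming Config is default-constructible and perf_count_mode is public, as the hunk above suggests (the include path and surrounding setup are illustrative):

```cpp
#include "snippets/lowered/linear_ir.hpp"

int main() {
    // Opt back into the universal chrono-based counters (the default is now Disabled).
    ov::snippets::lowered::Config config;
    config.perf_count_mode = ov::snippets::lowered::PerfCountMode::Chrono;
    return config.perf_count_mode == ov::snippets::lowered::PerfCountMode::Chrono ? 0 : 1;
}
```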
14 changes: 8 additions & 6 deletions src/common/snippets/include/snippets/op/perf_count.hpp
@@ -5,6 +5,7 @@
#pragma once

#include "openvino/op/op.hpp"
#include "openvino/runtime/threading/thread_local.hpp"

namespace ov {
namespace snippets {
@@ -59,7 +60,7 @@ class PerfCountBegin : public PerfCountBeginBase {
std::chrono::high_resolution_clock::time_point& get_start_time();

private:
std::chrono::high_resolution_clock::time_point start_time_stamp = {};
ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start_time_stamp;
};

/**
@@ -73,17 +74,18 @@ class PerfCountEnd : public PerfCountEndBase {
PerfCountEnd(const Output<Node>& pc_begin);
PerfCountEnd() = default;
~PerfCountEnd() {
uint64_t avg = iteration == 0 ? 0 : accumulation / iteration;
std::cout << "accumulation:" << accumulation << "ns, iteration:" << iteration << " avg:" << avg << "ns"<< std::endl;
output_perf_count();
}
void output_perf_count();
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;

std::shared_ptr<PerfCountBegin> get_pc_begin();
void init_pc_begin();
void set_accumulated_time();

private:
uint64_t accumulation = 0ul;
uint32_t iteration = 0u;
ov::threading::ThreadLocal<uint64_t> accumulation;
ov::threading::ThreadLocal<uint32_t> iteration;
std::shared_ptr<PerfCountBegin> m_pc_begin = nullptr;
};

} // namespace op
2 changes: 0 additions & 2 deletions src/common/snippets/src/generator.cpp
@@ -50,8 +50,6 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons
if (config.m_save_expressions || config.perf_count_mode != lowered::PerfCountMode::Disabled)
lowered_saved = linear_ir;

lowered_saved = linear_ir;

return { target->get_snippet() };
}

1 change: 1 addition & 0 deletions src/common/snippets/src/lowered/pass/insert_perf_count.cpp
@@ -49,6 +49,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) {

// insert perf_count_end before first result
const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
perf_count_end->set_friendly_name("last_parameter_to_first_result");
std::vector<PortConnectorPtr> pc_end_inputs;
pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
47 changes: 38 additions & 9 deletions src/common/snippets/src/op/perf_count.cpp
@@ -53,16 +53,17 @@ std::shared_ptr<Node> PerfCountBegin::clone_with_new_inputs(const OutputVector&
}

std::chrono::high_resolution_clock::time_point& PerfCountBegin::get_start_time() {
return start_time_stamp;
return start_time_stamp.local();
}

void PerfCountBegin::set_start_time() {
start_time_stamp = std::chrono::high_resolution_clock::now();
start_time_stamp.local() = std::chrono::high_resolution_clock::now();
}

//////////////////PerfCountEnd///////////////
PerfCountEnd::PerfCountEnd(const Output<Node>& pc_begin) : PerfCountEndBase({pc_begin}), accumulation(0ul), iteration(0u) {
constructor_validate_and_infer_types();
init_pc_begin();
}

std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& inputs) const {
@@ -71,15 +72,43 @@ std::shared_ptr<Node> PerfCountEnd::clone_with_new_inputs(const OutputVector& in

void PerfCountEnd::set_accumulated_time() {
auto current_time = std::chrono::high_resolution_clock::now();
auto& start_time = get_pc_begin()->get_start_time();
accumulation += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration++;
auto& start_time = m_pc_begin->get_start_time();
accumulation.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - start_time).count();
iteration.local()++;
}

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
return pc_begin;
void PerfCountEnd::init_pc_begin() {
m_pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
NODE_VALIDATION_CHECK(this, m_pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
}

void PerfCountEnd::output_perf_count() {
OPENVINO_ASSERT(accumulation.size() == iteration.size(), "accumulation size should be the same as iteration size in perf_count_end node.");
auto iterator_iter = iteration.begin();
auto iterator_acc = accumulation.begin();
int t_num = 0;
std::vector<uint64_t> avg_list;
const std::string friendly_name = get_friendly_name();
std::cout << "Perf count data in perfCountEnd node with name " << friendly_name << " is:" << std::endl;
for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
const auto iter = *iterator_iter;
const auto acc = *iterator_acc;
uint64_t avg = iter == 0 ? 0 : acc / iter;
avg_list.push_back(avg);
std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
t_num++;
}

// max across all threads: combine with a reduce-max
auto BinaryFunc = [](const uint64_t& a, const uint64_t& b) {
return a >= b ? a : b;
};
// max accumulation
uint64_t acc_max = accumulation.combine(BinaryFunc);
std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
// max avg (std::max_element expects a less-than comparator, so use the default one)
uint64_t avg_max = avg_list.empty() ? 0 : *std::max_element(avg_list.begin(), avg_list.end());
std::cout << "max avg time:" << avg_max << "ns" << std::endl;
}

} // namespace op
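The pattern above — a ThreadLocal time stamp written by set_start_time() and a ThreadLocal accumulator bumped by set_accumulated_time() — can be sketched in isolation. This is a simplified model under the assumptions visible in this diff (a default constructor, a value constructor, and local()); Timer and its members are illustrative names, not the node code itself:

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>
#include "openvino/runtime/threading/thread_local.hpp"

// Simplified model of the PerfCountBegin/PerfCountEnd pair: every thread
// stamps and accumulates into its own slot, so no synchronization is needed.
struct Timer {
    ov::threading::ThreadLocal<std::chrono::high_resolution_clock::time_point> start;
    ov::threading::ThreadLocal<uint64_t> acc{0ul};
    void begin() { start.local() = std::chrono::high_resolution_clock::now(); }
    void end() {
        const auto now = std::chrono::high_resolution_clock::now();
        acc.local() += std::chrono::duration_cast<std::chrono::nanoseconds>(now - start.local()).count();
    }
};

int main() {
    Timer t;
    t.begin();
    t.end();  // on this thread, acc.local() now holds the elapsed nanoseconds
    std::cout << "accumulated: " << t.acc.local() << "ns" << std::endl;
    return 0;
}
```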
14 changes: 14 additions & 0 deletions src/inference/dev_api/openvino/runtime/threading/thread_local.hpp
@@ -116,6 +116,20 @@ struct ThreadLocal {
auto end() const -> Iterator<decltype(_map.end())> const {
return {_map.end()};
}

// CombineFunc has signature T(T,T) or T(const T&, const T&)
template <typename CombineFunc>
T combine(CombineFunc f_combine) {
if (begin() != end()) {
auto ci = begin();
T my_result = *ci;
while (++ci != end())
my_result = f_combine(my_result, *ci);
return my_result;
} else {
return _create();
}
}
};

#endif
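A usage sketch for the new combine(), assuming only the value constructor, local(), and combine() shown in this diff (the thread count and lambda are illustrative; this mirrors the reduce-max in output_perf_count()):

```cpp
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>
#include "openvino/runtime/threading/thread_local.hpp"

int main() {
    ov::threading::ThreadLocal<uint64_t> counter(0ul);
    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&counter, t] {
            counter.local() += static_cast<uint64_t>(t + 1);  // each thread owns its slot
        });
    }
    for (auto& w : workers)
        w.join();
    // Reduce across the per-thread slots, as output_perf_count() does for the max.
    const uint64_t max_val = counter.combine([](const uint64_t& a, const uint64_t& b) {
        return a >= b ? a : b;
    });
    std::cout << "max over threads: " << max_val << std::endl;  // expected: 4
    return 0;
}
```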
15 changes: 0 additions & 15 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp
@@ -271,20 +271,5 @@ void jit_emitter::internal_call_postamble() const {
h->add(h->rsp, n_gprs_to_save * gpr_size);
}

// additional 16 byte for offset, callee can use arbitrary regs.
void jit_emitter::internal_call_rsp_align() const {
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
h->sub(h->rsp, h->rbx);
h->sub(h->rsp, 0x10);
h->mov(h->ptr[h->rsp], h->rbx);
}

void jit_emitter::internal_call_rsp_restore() const {
h->mov(h->rbx, h->ptr[h->rsp]);
h->add(h->rsp, 0x10);
h->add(h->rsp, h->rbx);
}

} // namespace intel_cpu
} // namespace ov
16 changes: 14 additions & 2 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
@@ -134,8 +134,20 @@ class jit_emitter : public ov::snippets::Emitter {

virtual void internal_call_preamble() const;
virtual void internal_call_postamble() const;
virtual void internal_call_rsp_align() const;
virtual void internal_call_rsp_restore() const;
// Must be inline functions to avoid corrupting rsp with a function call.
// An additional 16 bytes are reserved for the offset, since the callee can use arbitrary regs.
void internal_call_rsp_align() const {
h->mov(h->rbx, h->rsp);
h->and_(h->rbx, 0xf);
h->sub(h->rsp, h->rbx);
h->sub(h->rsp, 0x10);
h->mov(h->ptr[h->rsp], h->rbx);
}
void internal_call_rsp_restore() const {
h->mov(h->rbx, h->ptr[h->rsp]);
h->add(h->rsp, 0x10);
h->add(h->rsp, h->rbx);
}

private:
mutable std::vector<size_t> preserved_vec_idxs;
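The align/restore pair can be modeled in plain C++ to see the arithmetic: round rsp down to a 16-byte boundary, reserve one extra 16-byte slot, and stash the rounding offset there so restore can undo both steps. A sketch assuming the x86-64 16-byte call-site alignment rule (the pointer value is hypothetical):

```cpp
#include <cstdint>
#include <iostream>

int main() {
    uint64_t rsp = 0x7ffd4a2b39e8;      // hypothetical stack pointer on entry
    const uint64_t offset = rsp & 0xf;  // distance above the last 16-byte boundary
    rsp -= offset;                      // align down: sub rsp, rbx
    rsp -= 0x10;                        // reserve slot: sub rsp, 0x10 (keeps alignment)
    // mov [rsp], rbx would save `offset` here; restore re-reads it and adds back:
    const uint64_t restored = rsp + 0x10 + offset;
    std::cout << std::hex << "aligned rsp: 0x" << rsp
              << ", restored: 0x" << restored << std::endl;
    return 0;
}
```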
@@ -35,9 +35,9 @@ void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector<size_t> &i
const auto &set_start_time_overload = static_cast<void (*)(snippets::op::PerfCountBegin*)>(set_start_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_start_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_start_node.get()));
internal_call_rsp_align();
jit_emitter::internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
jit_emitter::internal_call_rsp_restore();

internal_call_postamble();
}
@@ -62,9 +62,9 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_
const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
internal_call_rsp_align();
jit_emitter::internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();
jit_emitter::internal_call_rsp_restore();

internal_call_postamble();
}
@@ -1115,9 +1115,9 @@ void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, c
h->mov(abi_param6, static_cast<int>(m_with_comp));
#endif

internal_call_rsp_align();
jit_emitter::internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
jit_emitter::internal_call_rsp_restore();

#ifdef _WIN32
h->add(h->rsp, num_args_passed_on_stack * gpr_size);
@@ -1321,9 +1321,9 @@ void BrgemmCopyBEmitter::emit_kernel_call(const matmul::jit_brgemm_matmul_copy_b
h->mov(abi_param6, K);
#endif

internal_call_rsp_align();
jit_emitter::internal_call_rsp_align();
h->call(h->rbp);
internal_call_rsp_restore();
jit_emitter::internal_call_rsp_restore();

#ifdef _WIN32
h->add(h->rsp, gpr_size * num_args_passed_on_stack);
@@ -0,0 +1,55 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "shape_inference.hpp"
#include <snippets/shape_inference/shape_infer_instances.hpp>
#include "op/brgemm_copy_b.hpp"
#include "op/brgemm_cpu.hpp"
#include "op/fused_mul_add.hpp"
#include "op/load_convert.hpp"
#include "op/store_convert.hpp"
#include "op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"

namespace ov {
namespace snippets {
using ShapeInferPtr = IShapeInferSnippetsFactory::ShapeInferPtr;

ShapeInferPtr CPUShapeInferSnippetsFactory::get_specific_op_shape_infer(const ov::DiscreteTypeInfo& key,
const std::shared_ptr<ov::Node>& op) const {
const auto& maker_iter = specific_ops_registry.find(key);
if (maker_iter != specific_ops_registry.end())
return maker_iter->second(op);
return {};
}


#define SHAPE_INFER_PREDEFINED(OP, InferType) \
{ OP::get_type_info_static(), [](const std::shared_ptr<ov::Node>& n) { return std::make_shared<InferType>();} }
#define SHAPE_INFER_OP_SPECIFIC(OP) \
{ OP::get_type_info_static(), [](const std::shared_ptr<ov::Node>& n) { return std::make_shared<OP::ShapeInfer>(n);} }
#define SHAPE_INFER_OP_SPECIFIC_EXTERNAL(OP, InferType) \
{ OP::get_type_info_static(), [](const std::shared_ptr<ov::Node>& n) { return std::make_shared<InferType>(n);} }

const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::specific_ops_registry {
SHAPE_INFER_PREDEFINED(ov::intel_cpu::FusedMulAdd, NumpyBroadcastShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::SwishNode, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),
};
#undef SHAPE_INFER_OP_SPECIFIC_EXTERNAL
#undef SHAPE_INFER_OP_SPECIFIC
#undef SHAPE_INFER_PREDEFINED

} // namespace snippets
} // namespace ov
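The registry above is a map from a static type key to factory lambdas. A self-contained sketch of the same pattern with illustrative names (Key strings, ShapeInfer, and registry here stand in for ov::DiscreteTypeInfo and the OpenVINO types; this is not the plugin code itself):

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct ShapeInfer {
    virtual ~ShapeInfer() = default;
    virtual const char* name() const = 0;
};
struct PassThrough : ShapeInfer { const char* name() const override { return "PassThrough"; } };
struct SingleElement : ShapeInfer { const char* name() const override { return "SingleElement"; } };

using Maker = std::function<std::shared_ptr<ShapeInfer>()>;

int main() {
    // Type-keyed factory registry, mirroring specific_ops_registry above
    // (string keys stand in for ov::DiscreteTypeInfo).
    const std::map<std::string, Maker> registry{
        {"SwishNode", [] { return std::make_shared<PassThrough>(); }},
        {"PerfCountRdtscBegin", [] { return std::make_shared<SingleElement>(); }},
    };
    const auto it = registry.find("PerfCountRdtscBegin");
    if (it != registry.end())
        std::cout << it->second()->name() << std::endl;  // prints SingleElement
    return 0;
}
```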
