apply review2 and enable chrono perf count by default for function test
chenhu-wang committed Oct 17, 2023
1 parent 6cfa293 commit 7ed6cc2
Showing 12 changed files with 27 additions and 77 deletions.
10 changes: 8 additions & 2 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -14,10 +14,16 @@ namespace ov {
namespace snippets {
namespace lowered {

// Snippets performance count mode
// Disabled - default, no perf count for snippets
// Chrono - perf count via chrono calls. This is a universal method.
// BackendSpecific - perf count provided by the backend. This is for device-specific requirements.
// For example, for lower overhead, the x86 CPU specific mode that reads the RDTSC register takes ~50ns,
// while Chrono mode takes ~260ns for a pair of perf count start and perf count end executions, measured on ICX.
enum PerfCountMode {
Disabled,
Chrono,
Rdtsc,
BackendSpecific,
};

class Config {
@@ -27,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
// Some Subgraphs doesn't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
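The comment block added above contrasts the Chrono and RDTSC timing costs. Chrono mode boils down to bracketing the measured region with a pair of clock reads and accumulating the difference. Below is a minimal, standalone sketch of that pattern; the class and member names are made up for illustration and this is not the snippets PerfCountBegin/PerfCountEnd implementation:

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>

// Illustrative chrono-based counter: only the start/stop/accumulate
// pattern mirrors what a Chrono perf-count pair does.
class ChronoPerfCounter {
public:
    void start() { m_start = std::chrono::high_resolution_clock::now(); }
    void stop() {
        const auto end = std::chrono::high_resolution_clock::now();
        m_accumulated_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(end - m_start).count();
        ++m_iterations;
    }
    uint64_t accumulated_ns() const { return m_accumulated_ns; }
    uint64_t iterations() const { return m_iterations; }

private:
    std::chrono::high_resolution_clock::time_point m_start;
    uint64_t m_accumulated_ns = 0;
    uint64_t m_iterations = 0;
};

int main() {
    ChronoPerfCounter counter;
    counter.start();
    volatile uint64_t sink = 0;
    for (uint64_t i = 0; i < 1000; ++i) sink = sink + i;  // measured region
    counter.stop();
    std::cout << counter.accumulated_ns() << " ns over " << counter.iterations() << " measurement(s)\n";
}
```

The two clock reads plus the bookkeeping around them are what the quoted ~260ns per begin/end pair refers to.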
6 changes: 4 additions & 2 deletions src/common/snippets/src/generator.cpp
@@ -50,6 +50,8 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons
if (config.m_save_expressions || config.perf_count_mode != lowered::PerfCountMode::Disabled)
lowered_saved = linear_ir;

lowered_saved = linear_ir;

return { target->get_snippet() };
}

@@ -64,8 +66,8 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
std::dynamic_pointer_cast<op::LoopEnd>(op) ||
std::dynamic_pointer_cast<op::Brgemm>(op) ||
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::PerfCountBegin>(op) ||
std::dynamic_pointer_cast<op::PerfCountEnd>(op))
std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
return gpr2gpr;
else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))
15 changes: 0 additions & 15 deletions src/common/snippets/src/lowered/pass/insert_loops.cpp
@@ -85,12 +85,6 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr&
init_params(loop_entries);
init_params(loop_exits);

// begin
// const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
// const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
// linear_ir.insert(loop_begin_pos, perf_count_begin_expr);
// begin

const auto& loop_begin = std::make_shared<op::LoopBegin>();
const auto& loop_begin_expr = linear_ir.create_expression(loop_begin, std::vector<PortConnectorPtr>{});
linear_ir.insert(loop_begin_pos, loop_begin_expr);
@@ -106,15 +100,6 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr&
const auto& loop_end_expr = linear_ir.create_expression(loop_end, loop_end_inputs);
const auto& it = linear_ir.insert(loop_end_pos, loop_end_expr);

// end
// const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
// std::vector<PortConnectorPtr> pc_end_inputs;
// pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
// // const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
// const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, {perf_count_begin_expr->get_output_port_connector(0)});
// linear_ir.insert(loop_end_pos, perf_count_end_expr);
// end

const auto outer_loop_ids = get_outer_loop_ids(*std::prev(it), loop_id);
loop_begin_expr->set_loop_ids(outer_loop_ids);
loop_end_expr->set_loop_ids(outer_loop_ids);
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/pass/insert_perf_count.cpp
@@ -52,7 +52,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) {
std::vector<PortConnectorPtr> pc_end_inputs;
pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
// linear_ir.insert(perf_count_end_pos, perf_count_end_expr);
linear_ir.insert(perf_count_end_pos, perf_count_end_expr);

return true;
}
13 changes: 6 additions & 7 deletions src/common/snippets/src/op/perf_count.cpp
@@ -16,7 +16,7 @@ void PerfCountBeginBase::validate_and_infer_types() {
OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
const auto& last_output_inputs = get_output_target_inputs(0);
OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one input attached to the last output");
const auto& pc_end = ov::as_type_ptr<PerfCountEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}

@@ -26,17 +26,17 @@ bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {

void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
set_output_type(0, element::f32, ov::PartialShape{ov::Shape{}});
set_output_type(0, element::f32, {});
}

//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}

void PerfCountEndBase::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
const auto pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBegin as the last argument");
set_output_type(0, element::f32, ov::PartialShape{ov::Shape{}});
const auto pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
set_output_type(0, element::f32, {});
}

bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
@@ -78,8 +78,7 @@ void PerfCountEnd::set_accumulated_time() {

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
if (!pc_begin)
throw std::invalid_argument("PerfCountEnd last input is not connected to PerfCountBegin");
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
return pc_begin;
}

@@ -110,20 +110,6 @@ INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(ENABLE);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(IGNORE_CALLBACK);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLE);

/**
 * @brief Defines Snippets performance count mode
 * @param DISABLED - default, no perf count for snippets
 * @param CHRONO - chrono-call-based perf count. This is a universal method for both ARM and x86.
 * @param RDTSC - perf count via reading the rdtsc register. This has lower overhead:
 * for example, tests show that RDTSC takes ~50ns while CHRONO takes ~260ns for a pair of perf count start and perf count
 * end executions. RDTSC is only applicable on x86.
 * @ingroup ie_dev_api_plugin_api
 */
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_KEY(SNIPPETS_PERF_COUNT_MODE);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLED);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(CHRONO);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(RDTSC);

} // namespace PluginConfigInternalParams

} // namespace InferenceEngine
10 changes: 0 additions & 10 deletions src/plugins/intel_cpu/src/config.cpp
@@ -237,16 +237,6 @@ void Config::readProperties(const std::map<std::string, std::string> &prop, cons
else
IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MODE
<< ". Expected values: ENABLE/DISABLE/IGNORE_CALLBACK";
} else if (key == PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE) {
if (val == PluginConfigInternalParams::DISABLED)
snippetsPCMode = SnippetsPerfCountMode::Disabled;
else if (val == PluginConfigInternalParams::CHRONO)
snippetsPCMode = SnippetsPerfCountMode::Chrono;
else if (val == PluginConfigInternalParams::RDTSC)
snippetsPCMode = SnippetsPerfCountMode::Rdtsc;
else
IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE
<< ". Expected values: DISABLE/CHRONO/RDTSC";
} else if (key == ov::hint::execution_mode.name()) {
if (val == "PERFORMANCE") {
executionMode = ov::hint::ExecutionMode::PERFORMANCE;
7 changes: 0 additions & 7 deletions src/plugins/intel_cpu/src/config.h
@@ -39,12 +39,6 @@ struct Config {
Disable,
};

enum SnippetsPerfCountMode {
Disabled,
Chrono,
Rdtsc,
};

enum class LatencyThreadingMode {
PER_NUMA_NODE,
PER_SOCKET,
@@ -59,7 +53,6 @@ struct Config {
bool collectPerfCounters = false;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
SnippetsPerfCountMode snippetsPCMode = SnippetsPerfCountMode::Disabled;
std::string dumpToDot = {};
std::string device_id = {};
float fcSparseWeiDecompressionRate = 1.0f;
@@ -60,7 +60,7 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_
internal_call_preamble();

const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
h->mov(h->rax, reinterpret_cast<size_t>(&set_accumulated_time_overload));
h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
internal_call_rsp_align();
h->call(h->rax);
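The one-line change in this hunk replaces reinterpret_cast<size_t>(&set_accumulated_time_overload) with reinterpret_cast<size_t>(set_accumulated_time_overload): the former loads the address of the local function-pointer variable into rax, while the latter loads the function's own address, which is what the generated indirect call needs. A small standalone illustration of the difference, with made-up names:

```cpp
#include <cstdint>
#include <iostream>

static void callback(int* counter) { ++(*counter); }

int main() {
    // A function-pointer variable, analogous to set_accumulated_time_overload.
    void (*fn)(int*) = callback;

    // Address of the pointer variable itself (a stack slot) -- calling through this would be wrong.
    const auto addr_of_variable = reinterpret_cast<uintptr_t>(&fn);
    // Address of the function the pointer refers to -- this is what an emitted `call rax` needs.
    const auto addr_of_function = reinterpret_cast<uintptr_t>(fn);

    std::cout << std::hex << addr_of_variable << " vs " << addr_of_function << std::dec << '\n';

    // Emulate the indirect call through the correct address.
    const auto target = reinterpret_cast<void (*)(int*)>(addr_of_function);
    int counter = 0;
    target(&counter);
    std::cout << "counter = " << counter << '\n';
}
```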
18 changes: 1 addition & 17 deletions src/plugins/intel_cpu/src/plugin.cpp
@@ -499,22 +499,6 @@ static Config::SnippetsMode getSnippetsMode(const std::map<std::string, std::str
IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK";
}

// static Config::SnippetsPerfCountMode getSnippetsPerfCountMode(const std::map<std::string, std::string>& modelConfig, const Config& engineConfig) {
// const auto& snippetsPCMode = modelConfig.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE);
// if (snippetsPCMode == modelConfig.end()) // not set explicitly
// return Config::SnippetsPerfCountMode::Disabled; // disabled by default

// const auto& val = snippetsPCMode->second;
// if (val == PluginConfigInternalParams::CHRONO)
// return Config::SnippetsPerfCountMode::Chrono;
// else if (val == PluginConfigInternalParams::RDTSC)
// return Config::SnippetsPerfCountMode::Rdtsc;
// else if (val == PluginConfigInternalParams::DISABLED)
// return Config::SnippetsPerfCountMode::Disabled;
// else
// IE_THROW() << "Wrong value for property key SNIPPETS_PERF_COUNT_MODE. Expected values: DISABLED/CHRONO/RDTSC";
// }

InferenceEngine::IExecutableNetworkInternal::Ptr
Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std::map<std::string, std::string> &orig_config) {
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Engine::LoadExeNetworkImpl");
@@ -549,7 +533,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
auto nGraphFunc = clonedNetwork.getFunction();
Config::ModelType modelType = getModelType(nGraphFunc);
ov::element::Type inferencePrecision = getInferencePrecision(config, engConfig, modelType);
const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig);
const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig);

DEBUG_LOG(PrintableModel(*nGraphFunc, "org_"));

@@ -8,7 +8,9 @@ using namespace ov;
using namespace ov::intel_cpu;

/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {}
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscBegin>();
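PerfCountRdtscBegin/PerfCountRdtscEnd implement the backend-specific mode referenced in linear_ir.hpp by reading the CPU time-stamp counter. Below is a minimal sketch of the underlying technique via the __rdtsc compiler intrinsic; it shows only the measurement idea, not the CPU plugin's emitter code:

```cpp
#include <cstdint>
#include <iostream>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

int main() {
    // Read the time-stamp counter before and after the measured region.
    const uint64_t start = __rdtsc();

    volatile uint64_t sink = 0;
    for (uint64_t i = 0; i < 1000; ++i) sink = sink + i;  // measured region

    const uint64_t end = __rdtsc();

    // The delta is in CPU reference cycles; converting to nanoseconds
    // requires the TSC frequency, which is platform-specific.
    std::cout << "elapsed cycles: " << (end - start) << '\n';
}
```

Because a single TSC read is much cheaper than a chrono call, this is the ~50ns-per-pair option quoted in the comments, at the cost of being x86-specific.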
@@ -9,6 +9,7 @@
#include "op/fused_mul_add.hpp"
#include "op/load_convert.hpp"
#include "op/store_convert.hpp"
#include "op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"

namespace ov {
@@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),
