apply review2 and enable chrono perf count by default for function test
chenhu-wang committed Oct 17, 2023
1 parent 6cfa293 commit 7ed6cc2
Showing 12 changed files with 27 additions and 77 deletions.
10 changes: 8 additions & 2 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -14,10 +14,16 @@ namespace ov {
namespace snippets {
namespace lowered {

// Snippets performance count mode
// Disabled - default, no perf count for snippets
// Chrono - perf count via chrono calls. This is a universal method.
// BackendSpecific - perf count provided by the backend. This is for device-specific requirements.
// For example, for lower overhead, the x86 CPU specific mode that reads the RDTSC register takes ~50ns,
// while Chrono mode takes ~260ns for a pair of perf count start and perf count end executions, measured on ICX.
enum PerfCountMode {
Disabled,
Chrono,
Rdtsc,
BackendSpecific,
};

class Config {
@@ -27,7 +33,7 @@ class Config {
// True if we should check runtime info for nodes to call specific needed transformations
bool m_need_fill_tail_register = false;
size_t m_loop_depth = 1;
PerfCountMode perf_count_mode = PerfCountMode::Disabled;
PerfCountMode perf_count_mode = PerfCountMode::Chrono;
// Some Subgraphs doesn't support domain optimization due to operations' semantics
bool m_enable_domain_optimization = false;
// Minimal advised work amount for parallel execution.
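The comment block added above contrasts the Chrono and RDTSC timing costs. Chrono mode boils down to bracketing the measured region with a pair of clock reads and accumulating the difference. Below is a minimal, standalone sketch of that pattern; the class and member names are made up for illustration and this is not the snippets PerfCountBegin/PerfCountEnd implementation:

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>

// Illustrative chrono-based counter: only the start/stop/accumulate
// pattern mirrors what a Chrono perf-count pair does.
class ChronoPerfCounter {
public:
    void start() { m_start = std::chrono::high_resolution_clock::now(); }
    void stop() {
        const auto end = std::chrono::high_resolution_clock::now();
        m_accumulated_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(end - m_start).count();
        ++m_iterations;
    }
    uint64_t accumulated_ns() const { return m_accumulated_ns; }
    uint64_t iterations() const { return m_iterations; }

private:
    std::chrono::high_resolution_clock::time_point m_start;
    uint64_t m_accumulated_ns = 0;
    uint64_t m_iterations = 0;
};

int main() {
    ChronoPerfCounter counter;
    counter.start();
    volatile uint64_t sink = 0;
    for (uint64_t i = 0; i < 1000; ++i) sink = sink + i;  // measured region
    counter.stop();
    std::cout << counter.accumulated_ns() << " ns over " << counter.iterations() << " measurement(s)\n";
}
```

The two clock reads plus the bookkeeping around them are what the quoted ~260ns per begin/end pair refers to.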
6 changes: 4 additions & 2 deletions src/common/snippets/src/generator.cpp
@@ -50,6 +50,8 @@ Generator::LoweringResult Generator::generate(lowered::LinearIR& linear_ir, cons
if (config.m_save_expressions || config.perf_count_mode != lowered::PerfCountMode::Disabled)
lowered_saved = linear_ir;

lowered_saved = linear_ir;

return { target->get_snippet() };
}

@@ -64,8 +66,8 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
std::dynamic_pointer_cast<op::LoopEnd>(op) ||
std::dynamic_pointer_cast<op::Brgemm>(op) ||
std::dynamic_pointer_cast<op::Buffer>(op) ||
std::dynamic_pointer_cast<op::PerfCountBegin>(op) ||
std::dynamic_pointer_cast<op::PerfCountEnd>(op))
std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
return gpr2gpr;
else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))
15 changes: 0 additions & 15 deletions src/common/snippets/src/lowered/pass/insert_loops.cpp
@@ -85,12 +85,6 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr&
init_params(loop_entries);
init_params(loop_exits);

// begin
// const auto& perf_count_begin = std::make_shared<op::PerfCountBegin>();
// const auto& perf_count_begin_expr = linear_ir.create_expression(perf_count_begin, std::vector<PortConnectorPtr>{});
// linear_ir.insert(loop_begin_pos, perf_count_begin_expr);
// begin

const auto& loop_begin = std::make_shared<op::LoopBegin>();
const auto& loop_begin_expr = linear_ir.create_expression(loop_begin, std::vector<PortConnectorPtr>{});
linear_ir.insert(loop_begin_pos, loop_begin_expr);
@@ -106,15 +100,6 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LinearIR::LoopManagerPtr&
const auto& loop_end_expr = linear_ir.create_expression(loop_end, loop_end_inputs);
const auto& it = linear_ir.insert(loop_end_pos, loop_end_expr);

// end
// const auto& perf_count_end = std::make_shared<op::PerfCountEnd>(perf_count_begin->output(0));
// std::vector<PortConnectorPtr> pc_end_inputs;
// pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
// // const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
// const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, {perf_count_begin_expr->get_output_port_connector(0)});
// linear_ir.insert(loop_end_pos, perf_count_end_expr);
// end

const auto outer_loop_ids = get_outer_loop_ids(*std::prev(it), loop_id);
loop_begin_expr->set_loop_ids(outer_loop_ids);
loop_end_expr->set_loop_ids(outer_loop_ids);
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/pass/insert_perf_count.cpp
@@ -52,7 +52,7 @@ bool InsertPerfCount::run(LinearIR& linear_ir) {
std::vector<PortConnectorPtr> pc_end_inputs;
pc_end_inputs.push_back(perf_count_begin_expr->get_output_port_connector(0));
const auto& perf_count_end_expr = linear_ir.create_expression(perf_count_end, pc_end_inputs);
// linear_ir.insert(perf_count_end_pos, perf_count_end_expr);
linear_ir.insert(perf_count_end_pos, perf_count_end_expr);

return true;
}
13 changes: 6 additions & 7 deletions src/common/snippets/src/op/perf_count.cpp
@@ -16,7 +16,7 @@ void PerfCountBeginBase::validate_and_infer_types() {
OPENVINO_ASSERT(get_output_size() == 1, "PerfCountBegin must have only one output");
const auto& last_output_inputs = get_output_target_inputs(0);
OPENVINO_ASSERT(last_output_inputs.size() == 1, "PerfCountBegin must have exactly one input attached to the last output");
const auto& pc_end = ov::as_type_ptr<PerfCountEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
const auto& pc_end = ov::as_type_ptr<PerfCountEndBase>(last_output_inputs.begin()->get_node()->shared_from_this());
OPENVINO_ASSERT(pc_end != nullptr, "PerfCountBegin must have PerfCountEnd connected to its last output");
}

@@ -26,17 +26,17 @@ bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {

void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
set_output_type(0, element::f32, ov::PartialShape{ov::Shape{}});
set_output_type(0, element::f32, {});
}

//////////////////PerfCountEndBase/////////////////
PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(args) {}

void PerfCountEndBase::validate_and_infer_types() {
NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
const auto pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBegin as the last argument");
set_output_type(0, element::f32, ov::PartialShape{ov::Shape{}});
const auto pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
set_output_type(0, element::f32, {});
}

bool PerfCountEndBase::visit_attributes(AttributeVisitor &visitor) {
@@ -78,8 +78,7 @@ void PerfCountEnd::set_accumulated_time() {

std::shared_ptr<PerfCountBegin> PerfCountEnd::get_pc_begin() {
const auto& pc_begin = ov::as_type_ptr<PerfCountBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
if (!pc_begin)
throw std::invalid_argument("PerfCountEnd last input is not connected to PerfCountBegin");
NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEnd last input is not connected to PerfCountBegin");
return pc_begin;
}

@@ -110,20 +110,6 @@ INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(ENABLE);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(IGNORE_CALLBACK);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLE);

/**
 * @brief Defines Snippets performance count mode
 * @param DISABLED - default, no perf count for snippets
 * @param CHRONO - chrono-call-based perf count. This is a universal method for both ARM and x86.
 * @param RDTSC - perf count via reading the rdtsc register. This has lower overhead:
 * for example, tests show that RDTSC takes ~50ns while CHRONO takes ~260ns for a pair of perf count start and perf count
 * end executions. RDTSC is only applicable on x86.
 * @ingroup ie_dev_api_plugin_api
 */
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_KEY(SNIPPETS_PERF_COUNT_MODE);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLED);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(CHRONO);
INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(RDTSC);

} // namespace PluginConfigInternalParams

} // namespace InferenceEngine
10 changes: 0 additions & 10 deletions src/plugins/intel_cpu/src/config.cpp
@@ -237,16 +237,6 @@ void Config::readProperties(const std::map<std::string, std::string> &prop, cons
else
IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MODE
<< ". Expected values: ENABLE/DISABLE/IGNORE_CALLBACK";
} else if (key == PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE) {
if (val == PluginConfigInternalParams::DISABLED)
snippetsPCMode = SnippetsPerfCountMode::Disabled;
else if (val == PluginConfigInternalParams::CHRONO)
snippetsPCMode = SnippetsPerfCountMode::Chrono;
else if (val == PluginConfigInternalParams::RDTSC)
snippetsPCMode = SnippetsPerfCountMode::Rdtsc;
else
IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE
<< ". Expected values: DISABLE/CHRONO/RDTSC";
} else if (key == ov::hint::execution_mode.name()) {
if (val == "PERFORMANCE") {
executionMode = ov::hint::ExecutionMode::PERFORMANCE;
7 changes: 0 additions & 7 deletions src/plugins/intel_cpu/src/config.h
@@ -39,12 +39,6 @@ struct Config {
Disable,
};

enum SnippetsPerfCountMode {
Disabled,
Chrono,
Rdtsc,
};

enum class LatencyThreadingMode {
PER_NUMA_NODE,
PER_SOCKET,
@@ -59,7 +53,6 @@ struct Config {
bool collectPerfCounters = false;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
SnippetsPerfCountMode snippetsPCMode = SnippetsPerfCountMode::Disabled;
std::string dumpToDot = {};
std::string device_id = {};
float fcSparseWeiDecompressionRate = 1.0f;
@@ -60,7 +60,7 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector<size_t> &in_
internal_call_preamble();

const auto &set_accumulated_time_overload = static_cast<void (*)(snippets::op::PerfCountEnd*)>(set_accumulated_time);
h->mov(h->rax, reinterpret_cast<size_t>(&set_accumulated_time_overload));
h->mov(h->rax, reinterpret_cast<size_t>(set_accumulated_time_overload));
h->mov(abi_param1, reinterpret_cast<size_t>(m_end_node.get()));
internal_call_rsp_align();
h->call(h->rax);
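The one-line change in this hunk replaces reinterpret_cast<size_t>(&set_accumulated_time_overload) with reinterpret_cast<size_t>(set_accumulated_time_overload): the former loads the address of the local function-pointer variable into rax, while the latter loads the function's own address, which is what the generated indirect call needs. A small standalone illustration of the difference, with made-up names:

```cpp
#include <cstdint>
#include <iostream>

static void callback(int* counter) { ++(*counter); }

int main() {
    // A function-pointer variable, analogous to set_accumulated_time_overload.
    void (*fn)(int*) = callback;

    // Address of the pointer variable itself (a stack slot) -- calling through this would be wrong.
    const auto addr_of_variable = reinterpret_cast<uintptr_t>(&fn);
    // Address of the function the pointer refers to -- this is what an emitted `call rax` needs.
    const auto addr_of_function = reinterpret_cast<uintptr_t>(fn);

    std::cout << std::hex << addr_of_variable << " vs " << addr_of_function << std::dec << '\n';

    // Emulate the indirect call through the correct address.
    const auto target = reinterpret_cast<void (*)(int*)>(addr_of_function);
    int counter = 0;
    target(&counter);
    std::cout << "counter = " << counter << '\n';
}
```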
18 changes: 1 addition & 17 deletions src/plugins/intel_cpu/src/plugin.cpp
@@ -499,22 +499,6 @@ static Config::SnippetsMode getSnippetsMode(const std::map<std::string, std::str
IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK";
}

// static Config::SnippetsPerfCountMode getSnippetsPerfCountMode(const std::map<std::string, std::string>& modelConfig, const Config& engineConfig) {
// const auto& snippetsPCMode = modelConfig.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_PERF_COUNT_MODE);
// if (snippetsPCMode == modelConfig.end()) // not set explicitly
// return Config::SnippetsPerfCountMode::Disabled; // disabled by default

// const auto& val = snippetsPCMode->second;
// if (val == PluginConfigInternalParams::CHRONO)
// return Config::SnippetsPerfCountMode::Chrono;
// else if (val == PluginConfigInternalParams::RDTSC)
// return Config::SnippetsPerfCountMode::Rdtsc;
// else if (val == PluginConfigInternalParams::DISABLED)
// return Config::SnippetsPerfCountMode::Disabled;
// else
// IE_THROW() << "Wrong value for property key SNIPPETS_PERF_COUNT_MODE. Expected values: DISABLED/CHRONO/RDTSC";
// }

InferenceEngine::IExecutableNetworkInternal::Ptr
Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std::map<std::string, std::string> &orig_config) {
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Engine::LoadExeNetworkImpl");
@@ -549,7 +533,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
auto nGraphFunc = clonedNetwork.getFunction();
Config::ModelType modelType = getModelType(nGraphFunc);
ov::element::Type inferencePrecision = getInferencePrecision(config, engConfig, modelType);
const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig);
const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig);

DEBUG_LOG(PrintableModel(*nGraphFunc, "org_"));

@@ -8,7 +8,9 @@ using namespace ov;
using namespace ov::intel_cpu;

/////////////////////////PerfCountRdtscBegin//////////////////////
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {}
PerfCountRdtscBegin::PerfCountRdtscBegin() : PerfCountBeginBase() {
validate_and_infer_types_except_PerfCountEnd();
}

std::shared_ptr<Node> PerfCountRdtscBegin::clone_with_new_inputs(const OutputVector& inputs) const {
return std::make_shared<PerfCountRdtscBegin>();
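PerfCountRdtscBegin/PerfCountRdtscEnd implement the backend-specific mode referenced in linear_ir.hpp by reading the CPU time-stamp counter. Below is a minimal sketch of the underlying technique via the __rdtsc compiler intrinsic; it shows only the measurement idea, not the CPU plugin's emitter code:

```cpp
#include <cstdint>
#include <iostream>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

int main() {
    // Read the time-stamp counter before and after the measured region.
    const uint64_t start = __rdtsc();

    volatile uint64_t sink = 0;
    for (uint64_t i = 0; i < 1000; ++i) sink = sink + i;  // measured region

    const uint64_t end = __rdtsc();

    // The delta is in CPU reference cycles; converting to nanoseconds
    // requires the TSC frequency, which is platform-specific.
    std::cout << "elapsed cycles: " << (end - start) << '\n';
}
```

Because a single TSC read is much cheaper than a chrono call, this is the ~50ns-per-pair option quoted in the comments, at the cost of being x86-specific.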
@@ -9,6 +9,7 @@
#include "op/fused_mul_add.hpp"
#include "op/load_convert.hpp"
#include "op/store_convert.hpp"
#include "op/perf_count_rdtsc.hpp"
#include "transformations/cpu_opset/common/op/swish_cpu.hpp"

namespace ov {
@@ -38,6 +39,8 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec
SHAPE_INFER_PREDEFINED(ov::intel_cpu::LoadConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertSaturation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::StoreConvertTruncation, PassThroughShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscBegin, SingleElementShapeInfer),
SHAPE_INFER_PREDEFINED(ov::intel_cpu::PerfCountRdtscEnd, EmptyShapeInfer),
SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer),
//
SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB),
