apply review comments 2

chenhu-wang · Nov 8, 2023 · e437e51 · e437e51
1 parent 03752c5
commit e437e51
Show file tree

Hide file tree

Showing 12 changed files with 33 additions and 90 deletions.
diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
@@ -29,10 +29,8 @@ class Generator;
 class LoweringResult {
     friend class Generator;
     // Some emitters rely on other precompiled kernels.
-    // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
+    // We need to keep the pointers to such emitters alive, so that the kernels (or the nodes held by the emitters) remain accessible at runtime.
     std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
-    // For perf count nodes, kernel will read/write these nodes, so should be alive in execution.
-    std::vector<std::shared_ptr<ov::Node>> m_saved_nodes{};
 
 public:
     std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;

diff --git a/src/common/snippets/include/snippets/lowered/expression_factory.hpp b/src/common/snippets/include/snippets/lowered/expression_factory.hpp
@@ -24,10 +24,6 @@ class LinearIR::ExpressionFactory {
             return create(loop_begin, params...);
         } else if (const auto loop_end = ov::as_type_ptr<op::LoopEnd>(n)) {
             return create(loop_end, params...);
-        } else if (const auto pc_begin = ov::as_type_ptr<op::PerfCountBegin>(n)) {
-            return create(pc_begin, params...);
-        } else if (const auto pc_end = ov::as_type_ptr<op::PerfCountEnd>(n)) {
-            return create(pc_end, params...);
         }
         return create(n, params...);
     }
@@ -52,8 +48,6 @@ class LinearIR::ExpressionFactory {
     static ExpressionPtr create(const std::shared_ptr<op::LoopBegin>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
     static ExpressionPtr create(const std::shared_ptr<op::LoopEnd>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
     static ExpressionPtr create(const std::shared_ptr<ov::Node>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
-    static ExpressionPtr create(const std::shared_ptr<op::PerfCountBegin>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
-    static ExpressionPtr create(const std::shared_ptr<op::PerfCountEnd>& n, const std::vector<PortConnectorPtr>& inputs, const LinearIR& linear_ir);
 
     // Creates inputs for expression using parent output port connectors
     static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr);

diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
@@ -15,7 +15,7 @@ namespace pass {
 
 /**
  * @interface InsertPerfCount
- * @brief Insert PerfCountBegin node after last paramter and insert PerfCountEnd node before first result.
+ * @brief Insert PerfCountBegin node after last parameter and insert PerfCountEnd node before first result.
  *  This is an illustrative transformation to enable perf count in snippets.
  *  Developers can modify this to insert perf count pairs around a sequence of nodes of interest.
  * @ingroup snippets

diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -152,7 +152,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
                                       LoweringResult& lowering_result,
                                       const lowered::pass::PassPipeline& backend_passes_pre_common,
                                       const lowered::pass::PassPipeline& backend_passes_post_common) const;
-    void perf_count_transformations(lowered::LinearIR& linear_ir) const;
     void init_config();
     // Count of Subgraph virtual ports:
     //  - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)

diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
@@ -44,23 +44,15 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
     }
     OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
 
-    // Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 1. Some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 2. Perf count emitters hold their perf count node as a field; saving the emitter keeps that node alive at runtime.
     if (linear_ir.get_config().m_save_expressions) {
         for (const auto& expr : linear_ir) {
             const auto& emitter = expr->get_emitter();
             if (uses_precompiled_kernel(emitter))
                 result.m_saved_emitters.emplace_back(emitter);
         }
     }
-    // perf count node should be alive in execution.
-    if (linear_ir.get_config().perf_count_mode != lowered::PerfCountMode::Disabled) {
-        for (const auto& expr : linear_ir) {
-            const auto& node = expr->get_node();
-            if (should_node_alive_in_execution(node)) {
-                result.m_saved_nodes.emplace_back(node);
-            }
-        }
-    }
     result.compiled_snippet = target->get_snippet();
 }
 

diff --git a/src/common/snippets/src/lowered/expression_factory.cpp b/src/common/snippets/src/lowered/expression_factory.cpp
@@ -117,37 +117,6 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::Loop
     return expr;
 }
 
-ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::PerfCountBegin>& n,
-                                                  const std::vector<PortConnectorPtr>& inputs,
-                                                  const LinearIR& linear_ir) {
-    OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin cannot have inputs");
-    auto expr = std::make_shared<Expression>(Expression(n, linear_ir.m_shape_infer_factory));
-    init_expression_inputs(expr, inputs);
-    create_expression_outputs(expr);
-    expr->validate();
-    return expr;
-}
-
-ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<op::PerfCountEnd>& n,
-                                                  const std::vector<PortConnectorPtr>& inputs,
-                                                  const LinearIR& linear_ir) {
-    auto expr = std::shared_ptr<Expression>(new Expression(n, linear_ir.m_shape_infer_factory));
-    expr->m_input_port_descriptors.resize(inputs.size(), nullptr);
-    for (size_t i = 0; i < inputs.size() - 1; ++i) {
-        expr->m_input_port_descriptors[i] = std::make_shared<PortDescriptor>();
-    }
-    const auto& last_input = inputs.back()->get_source();
-    OPENVINO_ASSERT(ov::is_type<op::PerfCountBegin>(last_input.get_expr()->get_node()),
-        "PerfCountEnd expression expects PerfCountBegin on last input");
-    expr->m_input_port_descriptors[inputs.size() - 1] = last_input.get_descriptor_ptr()->clone();
-    init_expression_inputs(expr, inputs);
-    // The PerfCountEnd node don't need output port (because of sense of the node). But each node in ngraph must have one output at least.
-    // The port descriptors are automatically created in constructor. We manually clean output ports.
-    expr->m_output_port_descriptors.clear();
-    expr->validate();
-    return expr;
-}
-
 ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<ov::Node>& n,
                                                   const std::vector<PortConnectorPtr>& inputs,
                                                   const LinearIR& linear_ir) {
@@ -160,7 +129,9 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr<ov::Node
     expr->validate();
     // todo: here we blindly synchronize input shapes from parent and child. Remove this when shapes are stored in
     //  the port connector itself
-    if (linear_ir.m_shape_infer_factory)
+    if (linear_ir.m_shape_infer_factory &&
+        !ov::is_type<op::PerfCountBeginBase>(n) &&
+        !ov::is_type<op::PerfCountEndBase>(n))
         expr->updateShapes();
     return expr;
 }

diff --git a/src/common/snippets/src/op/perf_count.cpp b/src/common/snippets/src/op/perf_count.cpp
@@ -25,7 +25,7 @@ bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
 }
 
 void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
-    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin dosen't expect any inputs");
+    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
     set_output_type(0, element::f32, {});
 }
 
@@ -34,7 +34,7 @@ PerfCountEndBase::PerfCountEndBase(const std::vector<Output<Node>> &args) : Op(a
 
 void PerfCountEndBase::validate_and_infer_types() {
     NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
-    const auto pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
+    const auto& pc_begin = ov::as_type_ptr<PerfCountBeginBase>(get_input_node_shared_ptr(0));
     NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
     set_output_type(0, element::f32, {});
 }
@@ -87,14 +87,14 @@ void PerfCountEnd::output_perf_count() {
     auto iterator_iter = iteration.begin();
     auto iterator_acc = accumulation.begin();
     int t_num = 0;
-    std::vector<uint64_t> avg_list;
-    std::string friendly_name = get_friendly_name();
+    uint64_t avg_max = 0;
     std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
     for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
         const auto iter = *iterator_iter;
         const auto acc = *iterator_acc;
         uint64_t avg = iter == 0 ? 0 : acc / iter;
-        avg_list.push_back(avg);
+        if (avg > avg_max)
+            avg_max = avg;
         std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
         t_num++;
     }
@@ -107,8 +107,7 @@ void PerfCountEnd::output_perf_count() {
     uint64_t acc_max = accumulation.combine(BinaryFunc);
     std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
     // max avg
-    auto avg_max = std::max_element(avg_list.begin(), avg_list.end(), BinaryFunc);
-    std::cout << "max avg time:" << *avg_max << "ns" << std::endl;
+    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
 }
 
 } // namespace op

diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
@@ -351,7 +351,8 @@ VectorDims Subgraph::infer_master_shape() {
 std::shared_ptr<lowered::LinearIR>
 Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
     lowered::Config lowering_config;
-    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
+    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
+        (lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
     lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
     lowering_config.m_loop_depth = tileRank;
     lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@@ -469,15 +470,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
 
-void Subgraph::perf_count_transformations(lowered::LinearIR& linear_ir) const {
-    INTERNAL_OP_SCOPE(Subgraph);
-    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::perf_count_transformations")
-
-    lowered::pass::PassPipeline perf_count_pipeline;
-    perf_count_pipeline.register_pass<lowered::pass::InsertPerfCount>();
-    perf_count_pipeline.run(linear_ir);
-}
-
 snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,
                                       const std::vector<ov::element::Type>& input_precisions,
                                       const std::vector<ov::element::Type>& output_precisions,
@@ -506,7 +498,8 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     LoweringResult lowering_result;
     control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
     if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
-        perf_count_transformations(linear_ir);
+        lowered::pass::InsertPerfCount perf_count_pass;
+        perf_count_pass.run(linear_ir);
     }
     m_generator->generate(linear_ir, lowering_result, compile_params);
 

diff --git a/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp
@@ -232,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
 }
 bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr<snippets::Emitter>& e) const {
     return std::dynamic_pointer_cast<intel_cpu::BrgemmEmitter>(e) ||
-           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e);
+           std::dynamic_pointer_cast<intel_cpu::BrgemmCopyBEmitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_chrono_end_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_start_emitter>(e) ||
+           std::dynamic_pointer_cast<intel_cpu::jit_perf_count_rdtsc_end_emitter>(e);
 }
-} // namespace ov
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
@@ -18,8 +18,7 @@ namespace intel_cpu {
 
 jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                             const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
-    auto start_op = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
-    m_current_count = &(start_op->start_count);
+    m_start_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscBegin>(n);
 }
 
 size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
@@ -37,7 +36,7 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in
     h->shl(h->rdx, 0x20);     // shift to higher half of rdx 0x20(32)
     h->or_(h->rdx, h->rax);   // rdx has current tsc
 
-    h->mov(h->rax, reinterpret_cast<size_t>(m_current_count));
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_start_node->start_count));
     h->mov(qword[h->rax], h->rdx);
 
     h->pop(h->rdx);
@@ -47,10 +46,7 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector<size_t> &in
 ///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
 jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
     const std::shared_ptr<ov::Node>& n) : jit_emitter(host, host_isa) {
-        auto end_op = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
-        m_accumulation = &(end_op->accumulation);
-        m_iteration = &(end_op->iteration);
-        m_start_count = &(end_op->get_pc_begin()->start_count);
+        m_end_node = ov::as_type_ptr<ov::intel_cpu::PerfCountRdtscEnd>(n);
 }
 
 size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
@@ -68,16 +64,16 @@ void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector<size_t> &in_i
     h->or_(h->rdx, h->rax);  // rdx has current tsc
 
     // tsc duration
-    h->mov(h->rax, reinterpret_cast<size_t>(m_start_count));
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->get_pc_begin()->start_count));
     h->sub(h->rdx, qword[h->rax]);  // rdx has tsc duration
 
-    // m_accumulation = m_accumulation + tsc duration
-    h->mov(h->rax, reinterpret_cast<size_t>(m_accumulation));
+    // accumulation = accumulation + tsc duration
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->accumulation));
     h->add(h->rdx, qword[h->rax]);
     h->mov(qword[h->rax], h->rdx);
 
     // iteration++
-    h->mov(h->rax, reinterpret_cast<size_t>(m_iteration));
+    h->mov(h->rax, reinterpret_cast<size_t>(&m_end_node->iteration));
     h->mov(h->rdx, qword[h->rax]);
     h->add(h->rdx, 0x01);
     h->mov(qword[h->rax], h->rdx);

diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
@@ -19,7 +19,7 @@ class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
 
 private:
     void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
-    mutable uint64_t* m_current_count = nullptr;
+    std::shared_ptr<ov::intel_cpu::PerfCountRdtscBegin> m_start_node = nullptr;
 };
 
 class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
@@ -30,9 +30,7 @@ class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
 
 private:
     void emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const override;
-    mutable uint64_t* m_start_count = nullptr;
-    mutable uint64_t* m_accumulation = nullptr;
-    mutable uint32_t* m_iteration = nullptr;
+    std::shared_ptr<ov::intel_cpu::PerfCountRdtscEnd> m_end_node = nullptr;
 };
 
 }   // namespace intel_cpu

diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
@@ -27,7 +27,6 @@ std::shared_ptr<Node> PerfCountRdtscEnd::clone_with_new_inputs(const OutputVecto
 
 std::shared_ptr<PerfCountRdtscBegin> PerfCountRdtscEnd::get_pc_begin() {
     const auto& pc_begin = ov::as_type_ptr<PerfCountRdtscBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
-    if (!pc_begin)
-        throw std::invalid_argument("PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
+    OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
     return  pc_begin;
 }