diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
index 9e8c6fac25b3ec..10a6609b6e9ef4 100644
--- a/src/common/snippets/include/snippets/generator.hpp
+++ b/src/common/snippets/include/snippets/generator.hpp
@@ -29,10 +29,8 @@ class Generator;
 class LoweringResult {
     friend class Generator;
     // Some emitters rely on other precompiled kernels.
-    // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
+    // We need to keep the pointers to such emitters alive, so the kernels or nodes would still be accessible at runtime.
     std::vector> m_saved_emitters{};
-    // For perf count nodes, kernel will read/write these nodes, so should be alive in execution.
-    std::vector> m_saved_nodes{};
 public:
     std::shared_ptr compiled_snippet = nullptr;
diff --git a/src/common/snippets/include/snippets/lowered/expression_factory.hpp b/src/common/snippets/include/snippets/lowered/expression_factory.hpp
index 0365db3bd6ed3d..f179abf746c313 100644
--- a/src/common/snippets/include/snippets/lowered/expression_factory.hpp
+++ b/src/common/snippets/include/snippets/lowered/expression_factory.hpp
@@ -24,10 +24,6 @@ class LinearIR::ExpressionFactory {
             return create(loop_begin, params...);
         } else if (const auto loop_end = ov::as_type_ptr(n)) {
             return create(loop_end, params...);
-        } else if (const auto pc_begin = ov::as_type_ptr(n)) {
-            return create(pc_begin, params...);
-        } else if (const auto pc_end = ov::as_type_ptr(n)) {
-            return create(pc_end, params...);
         }
         return create(n, params...);
     }
@@ -52,8 +48,6 @@ class LinearIR::ExpressionFactory {
     static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir);
     static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir);
     static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir);
-    static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir);
-    static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir);
     // Creates inputs for expression using parent output port connectors
     static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr);
diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
index b30e9b9136ef83..8478a0d931f182 100644
--- a/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
+++ b/src/common/snippets/include/snippets/lowered/pass/insert_perf_count.hpp
@@ -15,7 +15,7 @@ namespace pass {
 /**
  * @interface InsertPerfCount
- * @brief Insert PerfCountBegin node after last paramter and insert PerfCountEnd node before first result.
+ * @brief Insert PerfCountBegin node after last parameter and insert PerfCountEnd node before first result.
  * This is a illustration transformation to enable perf count in snippets.
  * Developers could modify this to insert perf count pairs around interested sequence of nodes.
  * @ingroup snippets
diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
index 84e1582f12f1f6..b642bbd7a23ccb 100644
--- a/src/common/snippets/include/snippets/op/subgraph.hpp
+++ b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -152,7 +152,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
                                       LoweringResult& lowering_result,
                                       const lowered::pass::PassPipeline& backend_passes_pre_common,
                                       const lowered::pass::PassPipeline& backend_passes_post_common) const;
-    void perf_count_transformations(lowered::LinearIR& linear_ir) const;
     void init_config();
     // Count of Subgraph virtual ports:
     //  - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)
diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
index ae4457e97c36c4..0dacee4878d598 100644
--- a/src/common/snippets/src/generator.cpp
+++ b/src/common/snippets/src/generator.cpp
@@ -44,7 +44,8 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
     }
     OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
-    // Note: some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 1. some emitters use precompiled kernels. They need to be saved, so the kernels are accessible at runtime.
+    // 2. perf count node as field of emitter should be alive at runtime.
     if (linear_ir.get_config().m_save_expressions) {
         for (const auto& expr : linear_ir) {
             const auto& emitter = expr->get_emitter();
@@ -52,15 +53,6 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
                 result.m_saved_emitters.emplace_back(emitter);
         }
     }
-    // perf count node should be alive in execution.
-    if (linear_ir.get_config().perf_count_mode != lowered::PerfCountMode::Disabled) {
-        for (const auto& expr : linear_ir) {
-            const auto& node = expr->get_node();
-            if (should_node_alive_in_execution(node)) {
-                result.m_saved_nodes.emplace_back(node);
-            }
-        }
-    }
     result.compiled_snippet = target->get_snippet();
 }
diff --git a/src/common/snippets/src/lowered/expression_factory.cpp b/src/common/snippets/src/lowered/expression_factory.cpp
index 7c6b22c1d07b9d..e2edb5e85e4523 100644
--- a/src/common/snippets/src/lowered/expression_factory.cpp
+++ b/src/common/snippets/src/lowered/expression_factory.cpp
@@ -117,37 +117,6 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n,
-                                                  const std::vector& inputs,
-                                                  const LinearIR& linear_ir) {
-    OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin cannot have inputs");
-    auto expr = std::make_shared(Expression(n, linear_ir.m_shape_infer_factory));
-    init_expression_inputs(expr, inputs);
-    create_expression_outputs(expr);
-    expr->validate();
-    return expr;
-}
-
-ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n,
-                                                  const std::vector& inputs,
-                                                  const LinearIR& linear_ir) {
-    auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory));
-    expr->m_input_port_descriptors.resize(inputs.size(), nullptr);
-    for (size_t i = 0; i < inputs.size() - 1; ++i) {
-        expr->m_input_port_descriptors[i] = std::make_shared();
-    }
-    const auto& last_input = inputs.back()->get_source();
-    OPENVINO_ASSERT(ov::is_type(last_input.get_expr()->get_node()),
-                    "PerfCountEnd expression expects PerfCountBegin on last input");
-    expr->m_input_port_descriptors[inputs.size() - 1] = last_input.get_descriptor_ptr()->clone();
-    init_expression_inputs(expr, inputs);
-    // The PerfCountEnd node don't need output port (because of sense of the node). But each node in ngraph must have one output at least.
-    // The port descriptors are automatically created in constructor. We manually clean output ports.
-    expr->m_output_port_descriptors.clear();
-    expr->validate();
-    return expr;
-}
-
 ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n,
                                                   const std::vector& inputs,
                                                   const LinearIR& linear_ir) {
@@ -160,7 +129,9 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr
     expr->validate();
     // todo: here we blindly synchronize input shapes from parent and child. Remove this when shapes will be stored in
     // port connector itself
-    if (linear_ir.m_shape_infer_factory)
+    if (linear_ir.m_shape_infer_factory &&
+        !ov::is_type(n) &&
+        !ov::is_type(n))
         expr->updateShapes();
     return expr;
 }
diff --git a/src/common/snippets/src/op/perf_count.cpp b/src/common/snippets/src/op/perf_count.cpp
index f4779cfaaf47d6..66061753373e18 100644
--- a/src/common/snippets/src/op/perf_count.cpp
+++ b/src/common/snippets/src/op/perf_count.cpp
@@ -25,7 +25,7 @@ bool PerfCountBeginBase::visit_attributes(AttributeVisitor &visitor) {
 }
 void PerfCountBeginBase::validate_and_infer_types_except_PerfCountEnd() {
-    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin dosen't expect any inputs");
+    NODE_VALIDATION_CHECK(this, get_input_size() == 0, "PerfCountBegin doesn't expect any inputs");
     set_output_type(0, element::f32, {});
 }
@@ -34,7 +34,7 @@ PerfCountEndBase::PerfCountEndBase(const std::vector> &args) : Op(a
 void PerfCountEndBase::validate_and_infer_types() {
     NODE_VALIDATION_CHECK(this, get_input_size() == 1, "PerfCountEndBase must have one input");
-    const auto pc_begin = ov::as_type_ptr(get_input_node_shared_ptr(0));
+    const auto& pc_begin = ov::as_type_ptr(get_input_node_shared_ptr(0));
     NODE_VALIDATION_CHECK(this, pc_begin != nullptr, "PerfCountEndBase must have PerfCountBeginBase as the last argument");
     set_output_type(0, element::f32, {});
 }
@@ -87,14 +87,14 @@ void PerfCountEnd::output_perf_count() {
     auto iterator_iter = iteration.begin();
     auto iterator_acc = accumulation.begin();
     int t_num = 0;
-    std::vector avg_list;
-    std::string friendly_name = get_friendly_name();
+    uint64_t avg_max = 0;
     std::cout << "Perf count data in perfCountEnd node with name " << get_friendly_name() << " is:"<< std::endl;
     for (; iterator_iter != iteration.end(); ++iterator_iter, ++iterator_acc) {
         const auto iter = *iterator_iter;
         const auto acc = *iterator_acc;
         uint64_t avg = iter == 0 ? 0 : acc / iter;
-        avg_list.push_back(avg);
+        if (avg > avg_max)
+            avg_max = avg;
         std::cout << "accumulated time:" << acc << "ns, iteration:" << iter << " avg time:" << avg << "ns"<< " on thread:" << t_num << std::endl;
         t_num++;
     }
@@ -107,8 +107,7 @@ void PerfCountEnd::output_perf_count() {
     uint64_t acc_max = accumulation.combine(BinaryFunc);
     std::cout << "max accumulated time:" << acc_max << "ns" << std::endl;
     // max avg
-    auto avg_max = std::max_element(avg_list.begin(), avg_list.end(), BinaryFunc);
-    std::cout << "max avg time:" << *avg_max << "ns" << std::endl;
+    std::cout << "max avg time:" << avg_max << "ns" << std::endl;
 }
 } // namespace op
diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
index d3f0dbabd459f4..661ffe16449851 100644
--- a/src/common/snippets/src/op/subgraph.cpp
+++ b/src/common/snippets/src/op/subgraph.cpp
@@ -351,7 +351,8 @@ VectorDims Subgraph::infer_master_shape() {
 std::shared_ptr Subgraph::convert_body_to_linear_ir(const std::shared_ptr& shape_infer_factory) {
     lowered::Config lowering_config;
-    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
+    lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops ||
+        (lowering_config.perf_count_mode != lowered::PerfCountMode::Disabled);
     lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
     lowering_config.m_loop_depth = tileRank;
     lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
@@ -469,15 +470,6 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
-void Subgraph::perf_count_transformations(lowered::LinearIR& linear_ir) const {
-    INTERNAL_OP_SCOPE(Subgraph);
-    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::perf_count_transformations")
-
-    lowered::pass::PassPipeline perf_count_pipeline;
-    perf_count_pipeline.register_pass();
-    perf_count_pipeline.run(linear_ir);
-}
-
 snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,
                                       const std::vector& input_precisions,
                                       const std::vector& output_precisions,
@@ -506,7 +498,8 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     LoweringResult lowering_result;
     control_flow_transformations(linear_ir, lowering_result, backend_passes_pre_common, backend_passes_post_common);
     if (linear_ir.get_config().perf_count_mode == lowered::PerfCountMode::Chrono) {
-        perf_count_transformations(linear_ir);
+        lowered::pass::InsertPerfCount perf_count_pass;
+        perf_count_pass.run(linear_ir);
     }
     m_generator->generate(linear_ir, lowering_result, compile_params);
diff --git a/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp
index 11e51c405790ba..11bf29362779f0 100644
--- a/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp
+++ b/src/plugins/intel_cpu/src/emitters/x64/cpu_generator.cpp
@@ -232,6 +232,10 @@ snippets::Generator::opRegType intel_cpu::CPUGenerator::get_specific_op_reg_type
 }
 bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& e) const {
     return std::dynamic_pointer_cast(e) ||
-        std::dynamic_pointer_cast(e);
+        std::dynamic_pointer_cast(e) ||
+        std::dynamic_pointer_cast(e) ||
+        std::dynamic_pointer_cast(e) ||
+        std::dynamic_pointer_cast(e) ||
+        std::dynamic_pointer_cast(e);
 }
-} // namespace ov
\ No newline at end of file
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
index 21e2b4b483607b..7f1ccda3aca62b 100644
--- a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.cpp
@@ -18,8 +18,7 @@ namespace intel_cpu {
 jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host,
                                                                        dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                        const std::shared_ptr& n) : jit_emitter(host, host_isa) {
-    auto start_op = ov::as_type_ptr(n);
-    m_current_count = &(start_op->start_count);
+    m_start_node = ov::as_type_ptr(n);
 }
 size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const {
@@ -37,7 +36,7 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector &in
     h->shl(h->rdx, 0x20);    // shift to higher half of rdx 0x20(32)
     h->or_(h->rdx, h->rax);  // rdx has current tsc
-    h->mov(h->rax, reinterpret_cast(m_current_count));
+    h->mov(h->rax, reinterpret_cast(&m_start_node->start_count));
     h->mov(qword[h->rax], h->rdx);
     h->pop(h->rdx);
@@ -47,10 +46,7 @@
 ///////////////////jit_perf_count_rdtsc_end_emitter////////////////////////////////////
 jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host,
                                                                    dnnl::impl::cpu::x64::cpu_isa_t host_isa,
                                                                    const std::shared_ptr& n) : jit_emitter(host, host_isa) {
-    auto end_op = ov::as_type_ptr(n);
-    m_accumulation = &(end_op->accumulation);
-    m_iteration = &(end_op->iteration);
-    m_start_count = &(end_op->get_pc_begin()->start_count);
+    m_end_node = ov::as_type_ptr(n);
 }
 size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const {
@@ -68,16 +64,16 @@ void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector &in_i
     h->or_(h->rdx, h->rax);  // rdx has current tsc
     // tsc duration
-    h->mov(h->rax, reinterpret_cast(m_start_count));
+    h->mov(h->rax, reinterpret_cast(&m_end_node->get_pc_begin()->start_count));
     h->sub(h->rdx, qword[h->rax]);  // rdx has tsc duration
-    // m_accumulation = m_accumulation + tsc duration
-    h->mov(h->rax, reinterpret_cast(m_accumulation));
+    // accumulation = accumulation + tsc duration
+    h->mov(h->rax, reinterpret_cast(&m_end_node->accumulation));
     h->add(h->rdx, qword[h->rax]);
     h->mov(qword[h->rax], h->rdx);
     // iteration++
-    h->mov(h->rax, reinterpret_cast(m_iteration));
+    h->mov(h->rax, reinterpret_cast(&m_end_node->iteration));
     h->mov(h->rdx, qword[h->rax]);
     h->add(h->rdx, 0x01);
     h->mov(qword[h->rax], h->rdx);
diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
index 3c831ade9213d9..c6314adc72a084 100644
--- a/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/x64/jit_perf_count_rdtsc_emitters.hpp
@@ -19,7 +19,7 @@ class jit_perf_count_rdtsc_start_emitter : public jit_emitter {
 private:
     void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override;
-    mutable uint64_t* m_current_count = nullptr;
+    std::shared_ptr m_start_node = nullptr;
 };
 class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
@@ -30,9 +30,7 @@ class jit_perf_count_rdtsc_end_emitter : public jit_emitter {
 private:
     void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override;
-    mutable uint64_t* m_start_count = nullptr;
-    mutable uint64_t* m_accumulation = nullptr;
-    mutable uint32_t* m_iteration = nullptr;
+    std::shared_ptr m_end_node = nullptr;
 };
 } // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
index da03c44b172c03..a3343d5ab74e8c 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/perf_count_rdtsc.cpp
@@ -27,7 +27,6 @@ std::shared_ptr PerfCountRdtscEnd::clone_with_new_inputs(const OutputVecto
 std::shared_ptr PerfCountRdtscEnd::get_pc_begin() {
     const auto& pc_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
-    if (!pc_begin)
-        throw std::invalid_argument("PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
+    OPENVINO_ASSERT(pc_begin != nullptr, "PerfCountRdtscEnd last input is not connected to PerfCountRdtscBegin");
     return pc_begin;
 }
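
Reviewer note on the rdtsc emitters above: the following is a plain C++ approximation of what the emitted begin/end sequences do with the node fields (start_count, accumulation, iteration, get_pc_begin()). The struct and function names are hypothetical stand-ins for illustration, not the real PerfCountRdtscBegin/PerfCountRdtscEnd ops or jit emitters.

#include <cstdint>
#include <memory>
#include <x86intrin.h>  // __rdtsc()

struct PerfCountBeginSketch {                         // stands in for PerfCountRdtscBegin
    uint64_t start_count = 0;
};

struct PerfCountEndSketch {                           // stands in for PerfCountRdtscEnd
    std::shared_ptr<PerfCountBeginSketch> pc_begin;   // analogous to get_pc_begin()
    uint64_t accumulation = 0;
    uint32_t iteration = 0;
};

// Roughly what jit_perf_count_rdtsc_start_emitter emits: store the current TSC into start_count.
inline void perf_begin(PerfCountBeginSketch& begin) {
    begin.start_count = __rdtsc();
}

// Roughly what jit_perf_count_rdtsc_end_emitter emits: accumulate the elapsed ticks and bump the iteration counter.
inline void perf_end(PerfCountEndSketch& end) {
    end.accumulation += __rdtsc() - end.pc_begin->start_count;
    ++end.iteration;
}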
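
On why dropping LoweringResult::m_saved_nodes appears safe: each perf-count emitter now keeps a shared_ptr to its node, and the uses_precompiled_kernel() extension above makes those emitters land in m_saved_emitters, which transitively keeps the counters the kernel writes to alive. A minimal lifetime illustration with hypothetical stand-in types (not the real op/emitter classes):

#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

struct CounterNode {                   // plays the role of the perf-count op holding the counters
    uint64_t accumulation = 0;
};

struct CounterEmitter {                // plays the role of the perf-count emitter
    std::shared_ptr<CounterNode> node; // emitter owns the node, as in the diff above
};

int main() {
    std::vector<std::shared_ptr<CounterEmitter>> saved_emitters;  // akin to LoweringResult::m_saved_emitters
    {
        auto node = std::make_shared<CounterNode>();
        saved_emitters.push_back(std::make_shared<CounterEmitter>(CounterEmitter{node}));
    }  // the graph-side shared_ptr to the node goes out of scope here
    // The counter storage is still reachable through the saved emitter, so a compiled
    // kernel writing to it at runtime would not touch freed memory.
    saved_emitters.front()->node->accumulation += 42;
    assert(saved_emitters.front()->node->accumulation == 42);
    return 0;
}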