saved node instead of entire LIR for perf count node

chenhu-wang · Nov 8, 2023 · 03752c5 · 03752c5
1 parent aa0cec4
commit 03752c5
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 6 deletions.
diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp
@@ -31,6 +31,8 @@ class LoweringResult {
     // Some emitters rely on other precompiled kernels.
     // We need to keep the pointers to such emitters alive, so the kernels would still be accessible at runtime.
     std::vector<std::shared_ptr<Emitter>> m_saved_emitters{};
+    // Perf count nodes are read and written by the kernel, so they must be kept alive during execution.
+    std::vector<std::shared_ptr<ov::Node>> m_saved_nodes{};
 
 public:
     std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;
@@ -126,6 +128,14 @@ class Generator {
     * @return bool
     */
     virtual bool uses_precompiled_kernel(const std::shared_ptr<Emitter>& emitter) const { return false; }
+    /**
+    * @brief Tells whether a node must be kept alive during execution; by default this holds for perf count begin/end nodes.
+    * @return true if op casts to op::PerfCountBeginBase or op::PerfCountEndBase, false otherwise
+    */
+    virtual bool should_node_alive_in_execution(const std::shared_ptr<ov::Node>& op) const {
+        return std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
+               std::dynamic_pointer_cast<op::PerfCountEndBase>(op);
+    }
 
     std::shared_ptr<TargetMachine> target;
 };

diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp
@@ -152,7 +152,7 @@ class Subgraph : public ov::op::util::SubGraphOp {
                                       LoweringResult& lowering_result,
                                       const lowered::pass::PassPipeline& backend_passes_pre_common,
                                       const lowered::pass::PassPipeline& backend_passes_post_common) const;
-    void perf_count_transformations(lowered::LinearIR& linear_ir);
+    void perf_count_transformations(lowered::LinearIR& linear_ir) const;
     void init_config();
     // Count of Subgraph virtual ports:
     //  - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)

diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp
@@ -52,6 +52,15 @@ void Generator::generate(lowered::LinearIR& linear_ir, LoweringResult& result, c
                 result.m_saved_emitters.emplace_back(emitter);
         }
     }
+    // Perf count nodes must stay alive during execution, so keep references to them in the lowering result.
+    if (linear_ir.get_config().perf_count_mode != lowered::PerfCountMode::Disabled) {
+        for (const auto& expr : linear_ir) {
+            const auto& node = expr->get_node();
+            if (should_node_alive_in_execution(node)) {
+                result.m_saved_nodes.emplace_back(node);
+            }
+        }
+    }
     result.compiled_snippet = target->get_snippet();
 }
 
@@ -66,7 +75,7 @@ Generator::opRegType Generator::get_op_reg_type(const std::shared_ptr<Node>& op)
         std::dynamic_pointer_cast<op::LoopEnd>(op) ||
         std::dynamic_pointer_cast<op::Brgemm>(op) ||
         std::dynamic_pointer_cast<op::Buffer>(op) ||
-        std::dynamic_pointer_cast<op::RankNormalization>(op)) ||
+        std::dynamic_pointer_cast<op::RankNormalization>(op) ||
         std::dynamic_pointer_cast<op::PerfCountBeginBase>(op) ||
         std::dynamic_pointer_cast<op::PerfCountEndBase>(op))
         return gpr2gpr;

diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp
@@ -469,7 +469,7 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
     lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
 }
 
-void Subgraph::perf_count_transformations(lowered::LinearIR& linear_ir) {
+void Subgraph::perf_count_transformations(lowered::LinearIR& linear_ir) const {
     INTERNAL_OP_SCOPE(Subgraph);
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::perf_count_transformations")
 
@@ -485,8 +485,6 @@ snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_sh
                                       const lowered::pass::PassPipeline& backend_passes_pre_common,
                                       const lowered::pass::PassPipeline& backend_passes_post_common,
                                       const std::shared_ptr<IShapeInferSnippetsFactory>& factory,
-}
-
                                       const void* compile_params) {
     data_flow_transformations(blocked_input_shapes, input_precisions, output_precisions, data_flow_backend_passes);
     convert_body_to_linear_ir(factory);
@@ -500,7 +498,6 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const lowered::pass::PassPi
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
     OPENVINO_ASSERT(m_generator != nullptr, "generate is called while generator is not set");
 
-
     // actual code emission
     // Note: some transformations performed in the generator, e.g. tail insertion, can break shape propagation
     //  until we fix this behavior, we have to make a copy of LIR before giving it to the generator.