Skip to content

Commit

Permalink
intermediatebuffer_output_share_memory
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Mar 19, 2024
1 parent 3305c93 commit 724b0b7
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 21 deletions.
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Generator;
* @brief Holds all relevant information produced during lowering
* @param compiled_snippet pointer to interface class that encapsulates compiled binary code
* @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute.
 * @param buffer_inplace_output index of the subgraph output whose memory the buffer shares: -1 means no sharing; a value i >= 0 means the buffer shares the memory of the i-th output.
* Must be allocated and freed by the backend.
*/
class LoweringResult {
Expand All @@ -35,6 +36,7 @@ class LoweringResult {
public:
std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;
size_t buffer_scratchpad_size = 0;
int buffer_inplace_output = -1;
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace pass {
class AllocateBuffers: public RangedPass {
public:
OPENVINO_RTTI("AllocateBuffers", "RangedPass")
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true);

/**
* @brief Apply the pass to the Linear IR
Expand All @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass {

using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;

private:
size_t& m_buffer_scratchpad_size;
int& m_buffer_inplace_output;
bool m_is_optimized_mode = true;
};

Expand Down
5 changes: 3 additions & 2 deletions src/common/snippets/src/lowered/pass/allocate_buffers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ namespace snippets {
namespace lowered {
namespace pass {

AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized), m_buffer_inplace_output(buffer_inplace_output) {}

void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
// If Buffer has offset We set this offset in the connected MemoryAccess ops
Expand Down Expand Up @@ -78,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const
pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
pipeline.register_pass<NormalizeBufferIDs>();
pipeline.run(linear_ir);
m_buffer_inplace_output = 0;
} else {
InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
}
Expand Down
16 changes: 15 additions & 1 deletion src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
#include <memory>
#include <array>

#include "snippets/lowered/pass/serialize_control_flow.hpp"

using namespace std;
using namespace ov::op::util;

Expand Down Expand Up @@ -441,6 +443,12 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input

manager.register_positioned_passes(backend_passes);
manager.run_passes(body_ptr());

// ov::pass::Manager magr;
// std::string xmlo = "data_flow.xml";
// std::string bino = "data_flow.bin";
// magr.register_pass<ov::pass::Serialize>(xmlo, bino);
// magr.run_passes(body_ptr());
}

void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
Expand Down Expand Up @@ -473,7 +481,8 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
pipeline.register_pass<lowered::pass::ValidateLoops>();
pipeline.register_pass<lowered::pass::InitLoops>();
pipeline.register_pass<lowered::pass::InsertLoops>();
pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized);
pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, lowering_result.buffer_inplace_output,
linear_ir.get_config().m_are_buffers_optimized);
pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
pipeline.register_positioned_passes(lowered_backend_passes);
pipeline.register_pass<lowered::pass::Validate>(); // must be last
Expand Down Expand Up @@ -514,6 +523,11 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptr<lower
perf_count_pass.run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
}
#endif

// std::string xmlo = "LIR.xml";
// lowered::pass::SerializeControlFlow SerializeLIR(xmlo);
// SerializeLIR.run(linear_ir);

m_generator->generate(linear_ir, lowering_result, compile_params);

VectorDims parallel_exec_domain = linear_ir.get_master_shape();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
size_t // Expected unique Buffer IDs count
// int // buffer output inplace
> BufferAllocationParams;

class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParams> {
Expand Down Expand Up @@ -46,6 +47,7 @@ class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParam

bool m_is_buffer_optimized = true;
bool m_with_split_loops = true;
int m_buffer_inplace_out = -1;
};

class EltwiseBufferAllocationTest : public BufferAllocationTest {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr<ov::snippe
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_is_buffer_optimized);
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_buffer_inplace_out, m_is_buffer_optimized);
pipeline.run(m_linear_ir);
}

Expand Down
46 changes: 33 additions & 13 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,16 +505,20 @@ void Snippet::SnippetJitExecutor::exec(const std::vector<MemoryPtr>& inMemPtrs,
}

void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args,
const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) {
const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs, size_t buffer_offset) {
for (size_t i = 0; i < inMemPtrs.size(); i++)
call_args.src_ptrs[i] = inMemPtrs[i]->getDataAs<const uint8_t>() + start_offset_in[i];

for (size_t i = 0; i < outMemPtrs.size(); i++)
call_args.dst_ptrs[i] = outMemPtrs[i]->getDataAs<uint8_t>() + start_offset_out[i];

if (buffer_scratchpad_size > 0) {
call_args.buffer_scratchpad_ptr =
if (buffer_inplace_output >= 0) {
call_args.buffer_scratchpad_ptr = call_args.dst_ptrs[buffer_inplace_output] + buffer_offset * dataSize[buffer_inplace_output + numInput];
} else {
if (buffer_scratchpad_size > 0) {
call_args.buffer_scratchpad_ptr =
reinterpret_cast<uint8_t*>(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size;
}
}
}

Expand Down Expand Up @@ -547,7 +551,12 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector<MemoryPtr>& inMe
[&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) {
int64_t indexes[] = {d0, d1, d2, d3, d4};
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs);
size_t buffer_offset = 0;
if (buffer_inplace_output >= 0) {
for (size_t i = 0; i < sizeof(indexes) / sizeof(indexes[0]); i++)
buffer_offset += indexes[i] * master_shape_stride[i];
}
update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset);
callable(&call_args, indexes);
});
}
Expand All @@ -558,9 +567,6 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector<MemoryPtr>& inMe
segfault_detector();
#endif
parallel_nt(0, [&](const int ithr, const int nthr) {
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs);

size_t start = 0, end = 0;
splitter(harnessWorkAmount, nthr, ithr, start, end);

Expand All @@ -571,7 +577,13 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector<MemoryPtr>& inMe
indexes[j] = static_cast<int64_t>(tmp % work_size[j]);
tmp /= work_size[j];
}

size_t buffer_offset = 0;
if (buffer_inplace_output >= 0) {
for (size_t i = 0; i < indexes.size(); i++)
buffer_offset += indexes[i] * master_shape_stride[i];
}
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset);
schedule.get_callable<kernel>()(&call_args, indexes.data());
}
});
Expand All @@ -595,10 +607,10 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna
in_shapes.emplace_back(s);
snippetAttrs.snippet->shape_infer(in_shapes);
}
const VectorDims& canonicalShape = snippetAttrs.snippet->infer_master_shape();
master_shape = snippetAttrs.snippet->infer_master_shape();

// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), canonicalShape.size());
tensorRank = std::max(static_cast<size_t>(rank6D), master_shape.size());
auto initDataSizes = [this]() {
dataSize.resize(numInput + numOutput);
for (size_t i = 0; i < numInput; i++)
Expand All @@ -608,18 +620,26 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna
};
initDataSizes();

if (snippets::utils::is_dynamic_vdims(canonicalShape))
if (snippets::utils::is_dynamic_vdims(master_shape))
OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline");

// generate
jit_snippets_compile_args jcp;
jcp.parallel_executor_ndims = tensorRank;
generate(&jcp);
buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size;
buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0);
buffer_inplace_output = schedule.lowering_result.buffer_inplace_output;
if (buffer_inplace_output == -1) {
buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size;
buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0);
}
parallel_exec_domain = schedule.parallel_exec_domain;
harnessWorkAmount = std::accumulate(parallel_exec_domain.begin(), parallel_exec_domain.end(), 1, std::multiplies<size_t>());
parallel_exec_domain = getNormalizedDimsBySize(parallel_exec_domain, tensorRank);
master_shape = getNormalizedDimsBySize(master_shape, tensorRank);
master_shape_stride = std::vector<size_t>(master_shape.size(), 1);
for (int i = master_shape_stride.size() - 2 ; i >= 0; i--) {
master_shape_stride[i] = master_shape_stride[i + 1] * master_shape[i + 1];
}
}

void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp) {
Expand Down
6 changes: 5 additions & 1 deletion src/plugins/intel_cpu/src/nodes/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ class Snippet : public Node {
size_t numOutput = 0;

void generate(const jit_snippets_compile_args*);
inline void update_ptrs(jit_snippets_call_args&, const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
inline void update_ptrs(jit_snippets_call_args&, const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs,
size_t buffer_offset);
// Evaluates generated snippet using parallel backend
void schedule_6d(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
void schedule_nt(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
Expand All @@ -125,6 +126,9 @@ class Snippet : public Node {
// Buffer scratchpad
std::vector<uint8_t> buffer_scratchpad = {};
size_t buffer_scratchpad_size = 0;
int buffer_inplace_output = -1;
VectorDims master_shape = {};
VectorDims master_shape_stride = {};

#ifdef SNIPPETS_DEBUG_CAPS
inline void segfault_detector();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCP
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_is_buffer_optimized);
int inplace = -1;
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, inplace, m_is_buffer_optimized);
pipeline.run(m_linear_ir);
}

Expand Down

0 comments on commit 724b0b7

Please sign in to comment.