diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 4c10f112bc2c42..f22ba76cba19e4 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -24,6 +24,7 @@ class Generator; * @brief Holds all relevant information produced during lowering * @param compiled_snippet pointer to interface class that encapsulates compiled binary code * @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute. * Must be allocated and freed by the backend. + * @param buffer_inplace_output index of the subgraph output whose memory the buffer shares: -1 means no sharing, i >= 0 means the buffer reuses the i-th output's memory. */ class LoweringResult { @@ -35,6 +36,7 @@ class LoweringResult { public: std::shared_ptr compiled_snippet = nullptr; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; }; /** diff --git a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp index 1ec9598ec1d2c2..b31a8ced9da702 100644 --- a/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp @@ -26,7 +26,7 @@ namespace pass { class AllocateBuffers: public RangedPass { public: OPENVINO_RTTI("AllocateBuffers", "RangedPass") - AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true); + AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true); /** * @brief Apply the pass to the Linear IR @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass { using BufferCluster = std::set; using BufferClusters = std::vector; + private: size_t& m_buffer_scratchpad_size; + int& m_buffer_inplace_output; bool m_is_optimized_mode = true; }; diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp
b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index cfdab4b48287c7..ffc842c6af4078 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -19,8 +19,8 @@ namespace snippets { namespace lowered { namespace pass { -AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized) - : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {} +AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized) + : m_buffer_scratchpad_size(buffer_scratchpad_size), m_buffer_inplace_output(buffer_inplace_output), m_is_optimized_mode(is_optimized) {} void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) { // If Buffer has offset We set this offset in the connected MemoryAccess ops @@ -78,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const pipeline.register_pass(m_buffer_scratchpad_size, buffer_clusters); pipeline.register_pass(); pipeline.run(linear_ir); + m_buffer_inplace_output = 0; } else { InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); } diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 3e050ee97cdd88..30da2c387ff422 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -56,6 +56,8 @@ #include #include +#include "snippets/lowered/pass/serialize_control_flow.hpp" + using namespace std; using namespace ov::op::util; @@ -441,6 +443,12 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input manager.register_positioned_passes(backend_passes); manager.run_passes(body_ptr()); + + // ov::pass::Manager magr; + // std::string xmlo = "data_flow.xml"; + // std::string bino = "data_flow.bin"; + // magr.register_pass(xmlo, bino); + //
magr.run_passes(body_ptr()); } void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, @@ -473,7 +481,8 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir, pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized); + pipeline.register_pass(lowering_result.buffer_scratchpad_size, lowering_result.buffer_inplace_output, + linear_ir.get_config().m_are_buffers_optimized); pipeline.register_pass(); pipeline.register_positioned_passes(lowered_backend_passes); pipeline.register_pass(); // must be last @@ -514,6 +523,11 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptrgenerate(linear_ir, lowering_result, compile_params); VectorDims parallel_exec_domain = linear_ir.get_master_shape(); diff --git a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp index dd5dd631437cd8..e6bafc19ef1700 100644 --- a/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp +++ b/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp @@ -17,7 +17,8 @@ typedef std::tuple< bool, // Optimized pipeline bool, // With SplitLoops opt size_t, // Expected Buffer size in bytes - size_t // Expected unique Buffer IDs count + size_t // Expected unique Buffer IDs count + // int // buffer output inplace > BufferAllocationParams; class BufferAllocationTest : public testing::TestWithParam { @@ -46,6 +47,7 @@ class BufferAllocationTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + pipeline.register_pass(m_buffer_scratchpad, m_buffer_inplace_out, m_is_buffer_optimized); pipeline.run(m_linear_ir); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp 
b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index f7a50ffa14852f..791be15c197a1c 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -505,16 +505,20 @@ void Snippet::SnippetJitExecutor::exec(const std::vector& inMemPtrs, } void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args, - const std::vector& inMemPtrs, const std::vector& outMemPtrs) { + const std::vector& inMemPtrs, const std::vector& outMemPtrs, size_t buffer_offset) { for (size_t i = 0; i < inMemPtrs.size(); i++) call_args.src_ptrs[i] = inMemPtrs[i]->getDataAs() + start_offset_in[i]; for (size_t i = 0; i < outMemPtrs.size(); i++) call_args.dst_ptrs[i] = outMemPtrs[i]->getDataAs() + start_offset_out[i]; - if (buffer_scratchpad_size > 0) { - call_args.buffer_scratchpad_ptr = + if (buffer_inplace_output >= 0) { + call_args.buffer_scratchpad_ptr = call_args.dst_ptrs[buffer_inplace_output] + buffer_offset * dataSize[buffer_inplace_output + numInput]; + } else { + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } } } @@ -547,7 +551,12 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector& inMe [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < sizeof(indexes) / sizeof(indexes[0]); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); callable(&call_args, indexes); }); } @@ -558,9 +567,6 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe segfault_detector(); #endif parallel_nt(0, [&](const int ithr, const int nthr) { - jit_snippets_call_args call_args; - 
update_ptrs(call_args, inMemPtrs, outMemPtrs); - size_t start = 0, end = 0; splitter(harnessWorkAmount, nthr, ithr, start, end); @@ -571,7 +577,13 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe indexes[j] = static_cast(tmp % work_size[j]); tmp /= work_size[j]; } - + size_t buffer_offset = 0; + if (buffer_inplace_output >= 0) { + for (size_t i = 0; i < indexes.size(); i++) + buffer_offset += indexes[i] * master_shape_stride[i]; + } + jit_snippets_call_args call_args; + update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset); schedule.get_callable()(&call_args, indexes.data()); } }); @@ -595,10 +607,10 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna in_shapes.emplace_back(s); snippetAttrs.snippet->shape_infer(in_shapes); } - const VectorDims& canonicalShape = snippetAttrs.snippet->infer_master_shape(); + master_shape = snippetAttrs.snippet->infer_master_shape(); // initialize by maximum output dimension. Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + tensorRank = std::max(static_cast(rank6D), master_shape.size()); auto initDataSizes = [this]() { dataSize.resize(numInput + numOutput); for (size_t i = 0; i < numInput; i++) @@ -608,18 +620,26 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna }; initDataSizes(); - if (snippets::utils::is_dynamic_vdims(canonicalShape)) + if (snippets::utils::is_dynamic_vdims(master_shape)) OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline"); // generate jit_snippets_compile_args jcp; jcp.parallel_executor_ndims = tensorRank; generate(&jcp); - buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size; - buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + buffer_inplace_output = schedule.lowering_result.buffer_inplace_output; + if (buffer_inplace_output == -1) { + buffer_scratchpad_size = 
schedule.lowering_result.buffer_scratchpad_size; + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); + } parallel_exec_domain = schedule.parallel_exec_domain; harnessWorkAmount = std::accumulate(parallel_exec_domain.begin(), parallel_exec_domain.end(), 1, std::multiplies()); parallel_exec_domain = getNormalizedDimsBySize(parallel_exec_domain, tensorRank); + master_shape = getNormalizedDimsBySize(master_shape, tensorRank); + master_shape_stride = std::vector(master_shape.size(), 1); + for (int i = static_cast<int>(master_shape_stride.size()) - 2; i >= 0; i--) { + master_shape_stride[i] = master_shape_stride[i + 1] * master_shape[i + 1]; + } } void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9ce3a3b71b760b..89f5221e09f978 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -101,7 +101,8 @@ class Snippet : public Node { size_t numOutput = 0; void generate(const jit_snippets_compile_args*); - inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs); + inline void update_ptrs(jit_snippets_call_args&, const std::vector& inMemPtrs, const std::vector& outMemPtrs, + size_t buffer_offset); // Evaluates generated snippet using parallel backend void schedule_6d(const std::vector& inMemPtrs, const std::vector& outMemPtrs); void schedule_nt(const std::vector& inMemPtrs, const std::vector& outMemPtrs); @@ -125,6 +126,9 @@ // Buffer scratchpad std::vector buffer_scratchpad = {}; size_t buffer_scratchpad_size = 0; + int buffer_inplace_output = -1; + VectorDims master_shape = {}; + VectorDims master_shape_stride = {}; #ifdef SNIPPETS_DEBUG_CAPS inline void segfault_detector(); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp
b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp index b66ce1919f6d23..07c84885796093 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp @@ -88,7 +88,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam(); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(m_buffer_scratchpad, m_is_buffer_optimized); + int inplace = -1; + pipeline.register_pass(m_buffer_scratchpad, inplace, m_is_buffer_optimized); pipeline.run(m_linear_ir); }