Skip to content

Commit

Permalink
intermediatebuffer_output_share_memory
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang committed Mar 19, 2024
1 parent 3305c93 commit 724b0b7
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 21 deletions.
2 changes: 2 additions & 0 deletions src/common/snippets/include/snippets/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Generator;
* @brief Holds all relevant information produced during lowering
* @param compiled_snippet pointer to interface class that encapsulates compiled binary code
* @param buffer_scratchpad_size the amount of additional memory required by the binary code to execute.
 * @param buffer_inplace_output index of the subgraph output whose memory the buffer shares: -1 means no sharing; a value i >= 0 means the buffer shares the memory of the i-th output.
* Must be allocated and freed by the backend.
*/
class LoweringResult {
Expand All @@ -35,6 +36,7 @@ class LoweringResult {
public:
std::shared_ptr<CompiledSnippet> compiled_snippet = nullptr;
size_t buffer_scratchpad_size = 0;
int buffer_inplace_output = -1;
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace pass {
class AllocateBuffers: public RangedPass {
public:
OPENVINO_RTTI("AllocateBuffers", "RangedPass")
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized = true);

/**
* @brief Apply the pass to the Linear IR
Expand All @@ -44,8 +44,10 @@ class AllocateBuffers: public RangedPass {

using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;

private:
size_t& m_buffer_scratchpad_size;
int& m_buffer_inplace_output;
bool m_is_optimized_mode = true;
};

Expand Down
5 changes: 3 additions & 2 deletions src/common/snippets/src/lowered/pass/allocate_buffers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ namespace snippets {
namespace lowered {
namespace pass {

AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, int& buffer_inplace_output, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized), m_buffer_inplace_output(buffer_inplace_output) {}

void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
// If Buffer has offset We set this offset in the connected MemoryAccess ops
Expand Down Expand Up @@ -78,6 +78,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const
pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
pipeline.register_pass<NormalizeBufferIDs>();
pipeline.run(linear_ir);
m_buffer_inplace_output = 0;
} else {
InitBuffersDefault(m_buffer_scratchpad_size).run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
}
Expand Down
16 changes: 15 additions & 1 deletion src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
#include <memory>
#include <array>

#include "snippets/lowered/pass/serialize_control_flow.hpp"

using namespace std;
using namespace ov::op::util;

Expand Down Expand Up @@ -441,6 +443,12 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input

manager.register_positioned_passes(backend_passes);
manager.run_passes(body_ptr());

// ov::pass::Manager magr;
// std::string xmlo = "data_flow.xml";
// std::string bino = "data_flow.bin";
// magr.register_pass<ov::pass::Serialize>(xmlo, bino);
// magr.run_passes(body_ptr());
}

void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
Expand Down Expand Up @@ -473,7 +481,8 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
pipeline.register_pass<lowered::pass::ValidateLoops>();
pipeline.register_pass<lowered::pass::InitLoops>();
pipeline.register_pass<lowered::pass::InsertLoops>();
pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized);
pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, lowering_result.buffer_inplace_output,
linear_ir.get_config().m_are_buffers_optimized);
pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
pipeline.register_positioned_passes(lowered_backend_passes);
pipeline.register_pass<lowered::pass::Validate>(); // must be last
Expand Down Expand Up @@ -514,6 +523,11 @@ snippets::Schedule Subgraph::generate_from_linear_ir(const std::shared_ptr<lower
perf_count_pass.run(linear_ir, linear_ir.cbegin(), linear_ir.cend());
}
#endif

// std::string xmlo = "LIR.xml";
// lowered::pass::SerializeControlFlow SerializeLIR(xmlo);
// SerializeLIR.run(linear_ir);

m_generator->generate(linear_ir, lowering_result, compile_params);

VectorDims parallel_exec_domain = linear_ir.get_master_shape();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
size_t // Expected unique Buffer IDs count
// int // buffer output inplace
> BufferAllocationParams;

class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParams> {
Expand Down Expand Up @@ -46,6 +47,7 @@ class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParam

bool m_is_buffer_optimized = true;
bool m_with_split_loops = true;
int m_buffer_inplace_out = -1;
};

class EltwiseBufferAllocationTest : public BufferAllocationTest {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr<ov::snippe
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_is_buffer_optimized);
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_buffer_inplace_out, m_is_buffer_optimized);
pipeline.run(m_linear_ir);
}

Expand Down
46 changes: 33 additions & 13 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,16 +505,20 @@ void Snippet::SnippetJitExecutor::exec(const std::vector<MemoryPtr>& inMemPtrs,
}

void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args,
const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) {
const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs, size_t buffer_offset) {
for (size_t i = 0; i < inMemPtrs.size(); i++)
call_args.src_ptrs[i] = inMemPtrs[i]->getDataAs<const uint8_t>() + start_offset_in[i];

for (size_t i = 0; i < outMemPtrs.size(); i++)
call_args.dst_ptrs[i] = outMemPtrs[i]->getDataAs<uint8_t>() + start_offset_out[i];

if (buffer_scratchpad_size > 0) {
call_args.buffer_scratchpad_ptr =
if (buffer_inplace_output >= 0) {
call_args.buffer_scratchpad_ptr = call_args.dst_ptrs[buffer_inplace_output] + buffer_offset * dataSize[buffer_inplace_output + numInput];
} else {
if (buffer_scratchpad_size > 0) {
call_args.buffer_scratchpad_ptr =
reinterpret_cast<uint8_t*>(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size;
}
}
}

Expand Down Expand Up @@ -547,7 +551,12 @@ void Snippet::SnippetJitExecutor::schedule_6d(const std::vector<MemoryPtr>& inMe
[&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) {
int64_t indexes[] = {d0, d1, d2, d3, d4};
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs);
size_t buffer_offset = 0;
if (buffer_inplace_output >= 0) {
for (size_t i = 0; i < sizeof(indexes) / sizeof(indexes[0]); i++)
buffer_offset += indexes[i] * master_shape_stride[i];
}
update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset);
callable(&call_args, indexes);
});
}
Expand All @@ -558,9 +567,6 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector<MemoryPtr>& inMe
segfault_detector();
#endif
parallel_nt(0, [&](const int ithr, const int nthr) {
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs);

size_t start = 0, end = 0;
splitter(harnessWorkAmount, nthr, ithr, start, end);

Expand All @@ -571,7 +577,13 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector<MemoryPtr>& inMe
indexes[j] = static_cast<int64_t>(tmp % work_size[j]);
tmp /= work_size[j];
}

size_t buffer_offset = 0;
if (buffer_inplace_output >= 0) {
for (size_t i = 0; i < indexes.size(); i++)
buffer_offset += indexes[i] * master_shape_stride[i];
}
jit_snippets_call_args call_args;
update_ptrs(call_args, inMemPtrs, outMemPtrs, buffer_offset);
schedule.get_callable<kernel>()(&call_args, indexes.data());
}
});
Expand All @@ -595,10 +607,10 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna
in_shapes.emplace_back(s);
snippetAttrs.snippet->shape_infer(in_shapes);
}
const VectorDims& canonicalShape = snippetAttrs.snippet->infer_master_shape();
master_shape = snippetAttrs.snippet->infer_master_shape();

// initialize by maximum output dimension. Dimensions of outputs should be broadcastable
tensorRank = std::max(static_cast<size_t>(rank6D), canonicalShape.size());
tensorRank = std::max(static_cast<size_t>(rank6D), master_shape.size());
auto initDataSizes = [this]() {
dataSize.resize(numInput + numOutput);
for (size_t i = 0; i < numInput; i++)
Expand All @@ -608,18 +620,26 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna
};
initDataSizes();

if (snippets::utils::is_dynamic_vdims(canonicalShape))
if (snippets::utils::is_dynamic_vdims(master_shape))
OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline");

// generate
jit_snippets_compile_args jcp;
jcp.parallel_executor_ndims = tensorRank;
generate(&jcp);
buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size;
buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0);
buffer_inplace_output = schedule.lowering_result.buffer_inplace_output;
if (buffer_inplace_output == -1) {
buffer_scratchpad_size = schedule.lowering_result.buffer_scratchpad_size;
buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0);
}
parallel_exec_domain = schedule.parallel_exec_domain;
harnessWorkAmount = std::accumulate(parallel_exec_domain.begin(), parallel_exec_domain.end(), 1, std::multiplies<size_t>());
parallel_exec_domain = getNormalizedDimsBySize(parallel_exec_domain, tensorRank);
master_shape = getNormalizedDimsBySize(master_shape, tensorRank);
master_shape_stride = std::vector<size_t>(master_shape.size(), 1);
for (int i = master_shape_stride.size() - 2 ; i >= 0; i--) {
master_shape_stride[i] = master_shape_stride[i + 1] * master_shape[i + 1];
}
}

void Snippet::SnippetJitExecutor::generate(const jit_snippets_compile_args* jcp) {
Expand Down
6 changes: 5 additions & 1 deletion src/plugins/intel_cpu/src/nodes/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ class Snippet : public Node {
size_t numOutput = 0;

void generate(const jit_snippets_compile_args*);
inline void update_ptrs(jit_snippets_call_args&, const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
inline void update_ptrs(jit_snippets_call_args&, const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs,
size_t buffer_offset);
// Evaluates generated snippet using parallel backend
void schedule_6d(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
void schedule_nt(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs);
Expand All @@ -125,6 +126,9 @@ class Snippet : public Node {
// Buffer scratchpad
std::vector<uint8_t> buffer_scratchpad = {};
size_t buffer_scratchpad_size = 0;
int buffer_inplace_output = -1;
VectorDims master_shape = {};
VectorDims master_shape_stride = {};

#ifdef SNIPPETS_DEBUG_CAPS
inline void segfault_detector();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCP
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, m_is_buffer_optimized);
int inplace = -1;
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, inplace, m_is_buffer_optimized);
pipeline.run(m_linear_ir);
}

Expand Down

0 comments on commit 724b0b7

Please sign in to comment.