openvinotoolkit
diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.cpp
Lines changed: 65 additions & 0 deletions
diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/subgraph.hpp
Lines changed: 39 additions & 0 deletions
diff --git a/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/subgraph.cpp
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "nodes/executors/aarch64/subgraph.hpp"
+
+#include "snippets/op/subgraph.hpp"
+
+
+namespace ov {
+namespace intel_cpu {
+
+// Runs the static-shape kernel over the parallel execution domain.
+// Call arguments are prepared once per worker thread; the kernel is then invoked for every
+// index tuple handed out by the parallel loop helper.
+void SubgraphStaticExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                       const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& kernel_fn = m_schedule->get_callable<kernel>();
+
+    // Per-thread setup: fill the call args and point them at this thread's scratchpad.
+    auto prepare_args = [&](jit_snippets_call_args& args, size_t thread_id) {
+        init_call_args(args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out, thread_id);
+        update_scratchpad_ptr(args.buffer_scratchpad_ptr, thread_id);
+    };
+    // Per-iteration work: the kernel receives the call args and the raw index array.
+    auto run_kernel = [&](jit_snippets_call_args& args, const std::vector<size_t>& idx, size_t /*thread_id*/) {
+        kernel_fn(&args, idx.data());
+    };
+
+    // A domain of rank6D uses the dedicated 6D loop; every other rank uses the generic loop.
+    const bool use_6d_loop = m_parallel_exec_domain.size() == rank6D;
+    if (use_6d_loop) {
+        parallel_for6d(prepare_args, run_kernel);
+    } else {
+        parallel_forNd(prepare_args, run_kernel);
+    }
+}
+
+// Runs the dynamic-shape kernel specialization over the parallel execution domain.
+// The original source/destination pointers are collected once up front; each iteration then
+// calls update_ptrs with the current index tuple before invoking the kernel.
+void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector<MemoryPtr>& inMemPtrs,
+                                                   const std::vector<MemoryPtr>& outMemPtrs) {
+    const auto& kernel_fn = m_schedule->get_callable<dynamic_kernel>();
+
+    // One offset vector is expected per input/output, each as long as the execution domain.
+    OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
+    OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(),
+                    "Data offsets with invalid ranks detected");
+
+    // Note: the KernelExecutorTable has to be restored to the state recorded in the
+    // SubgraphDynamicSpecializedExecutor constructor, since the table might have been used for other shapes
+    m_reset_exec_table_state();
+
+    std::vector<const uint8_t*> input_ptrs;
+    std::vector<uint8_t*> output_ptrs;
+    init_original_ptrs(inMemPtrs, outMemPtrs, input_ptrs, output_ptrs, m_start_offset_in, m_start_offset_out);
+
+    // Per-thread setup: fill the call args and point them at this thread's scratchpad.
+    auto prepare_args = [&](jit_snippets_call_args& args, size_t thread_id) {
+        init_call_args(args, thread_id);
+        update_scratchpad_ptr(args.buffer_scratchpad_ptr, thread_id);
+    };
+    // Per-iteration work: refresh the data pointers for the current indexes, then run the kernel.
+    auto run_kernel = [&](jit_snippets_call_args& args, const std::vector<size_t>& idx, size_t /*thread_id*/) {
+        update_ptrs(args, input_ptrs, output_ptrs, idx);
+        kernel_fn(&args);
+    };
+
+    // A domain of rank6D uses the dedicated 6D loop; every other rank uses the generic loop.
+    const bool use_6d_loop = m_parallel_exec_domain.size() == rank6D;
+    if (use_6d_loop) {
+        parallel_for6d(prepare_args, run_kernel);
+    } else {
+        parallel_forNd(prepare_args, run_kernel);
+    }
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "nodes/executors/subgraph.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+// aarch64 executor base: initializes the shared SubgraphBaseExecutor state and allocates the
+// buffer scratchpad.
+class SubgraphExecutor : public SubgraphBaseExecutor {
+public:
+    // The parameter list mirrors the SubgraphBaseExecutor constructor so that every argument is
+    // passed on to the base class. The base constructor computes m_internal_buffer_size, which has
+    // to be set before a scratchpad of that size is requested from `allocator`.
+    SubgraphExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                     const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                     const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                     const std::vector<ptrdiff_t>& start_offset_in,
+                     const std::vector<ptrdiff_t>& start_offset_out,
+                     const BufferScratchpadAllocator& allocator,
+                     const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
+        : SubgraphBaseExecutor(snippet_config,
+                               snippet_attrs,
+                               snippet,
+                               start_offset_in,
+                               start_offset_out,
+                               allocator,
+                               kernel_cache) {
+        m_buffer_scratchpad = allocator(m_internal_buffer_size);
+    }
+};
+
+// aarch64 executor for subgraphs with static shapes.
+class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor {
+public:
+    // Passes the runtime config and all remaining arguments on to SubgraphExecutor.
+    // The pack is perfectly forwarded so that shared pointers and offset vectors are not copied
+    // on their way to the base constructor.
+    template <typename... Args>
+    SubgraphStaticExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args&&... args)
+        : SubgraphExecutor(snippet_config, std::forward<Args>(args)...),
+          SubgraphStaticBaseExecutor() {}
+
+    // Runs the static kernel over the parallel execution domain.
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+// aarch64 executor for the dynamic-shape kernel specialization.
+class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor {
+public:
+    // Passes the runtime config and all remaining arguments on to SubgraphExecutor, and the runtime
+    // config alone to SubgraphDynamicSpecializedBaseExecutor.
+    // The pack is perfectly forwarded so that shared pointers and offset vectors are not copied
+    // on their way to the base constructor.
+    template <typename... Args>
+    SubgraphDynamicSpecializedExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config, Args&&... args)
+        : SubgraphExecutor(snippet_config, std::forward<Args>(args)...),
+          SubgraphDynamicSpecializedBaseExecutor(snippet_config) {}
+
+    // Runs the dynamic kernel over the parallel execution domain.
+    void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override;
+};
+
+}  // namespace intel_cpu
+}  // namespace ov
@@ -0,0 +1,141 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "nodes/executors/subgraph.hpp"
+#if defined(OPENVINO_ARCH_ARM64)
+#    include "emitters/snippets/aarch64/cpu_generator.hpp"
+#else
+#    include "emitters/snippets/x64/cpu_generator.hpp"
+#endif
+
+namespace ov {
+namespace intel_cpu {
+
+// Generates the schedule for a snippet: fills the compile arguments from the runtime config
+// (I/O data offsets and the parallel execution domain) and hands them to the snippet's generator.
+SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                                             const std::shared_ptr<CPURuntimeConfig>& config) {
+    OPENVINO_ASSERT(snippet_attrs, "Subgraph attributes are empty!");
+    OPENVINO_ASSERT(config, "Runtime Config is empty!");
+
+    jit_snippets_compile_args compile_args;
+    compile_args.data_offsets = config->io_data_offsets;
+    SubgraphBaseExecutor::init_parallel_domain(config, compile_args.exec_domain);
+
+    // generate() receives the compile arguments as an untyped pointer.
+    const void* const raw_compile_args = &compile_args;
+    schedule = std::make_shared<ov::snippets::Schedule>(snippet_attrs->snippet->generate(raw_compile_args));
+}
+
+// Builds the executor state shared by all subgraph executors: the schedule, the start offsets,
+// the parallel execution domain, the thread count and the total scratchpad size.
+// `snippet_attrs`, `allocator` and `kernel_cache` are accepted but not used by this constructor.
+// NOTE(review): `snippet` is dereferenced in the initializer list before any null check — confirm
+// that callers never pass an empty pointer.
+SubgraphBaseExecutor::SubgraphBaseExecutor(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                           const std::shared_ptr<SubgraphAttrs>& snippet_attrs,
+                                           const std::shared_ptr<SubgraphCodeGenerator>& snippet,
+                                           const std::vector<ptrdiff_t>& start_offset_in,
+                                           const std::vector<ptrdiff_t>& start_offset_out,
+                                           const BufferScratchpadAllocator& allocator,
+                                           const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache)
+    : m_schedule(snippet->get()),
+      m_start_offset_in(start_offset_in),
+      m_start_offset_out(start_offset_out) {
+    OPENVINO_ASSERT(m_schedule, "Schedule is empty!");
+    OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!");
+    init_parallel_domain(snippet_config, m_parallel_exec_domain);
+
+    m_tensor_rank = snippet_config->tensor_rank;
+    // Total number of kernel invocations: the product of all parallel domain dimensions.
+    m_harness_work_amount = std::accumulate(m_parallel_exec_domain.cbegin(),
+                                            m_parallel_exec_domain.cend(),
+                                            size_t(1),
+                                            std::multiplies<size_t>());
+    // Never use more threads than there are work items. The minimum is taken in size_t and only
+    // then narrowed: casting the work amount to int first would truncate large values and could
+    // yield a zero or negative thread count.
+    const auto max_threads = static_cast<size_t>(parallel_get_max_threads());
+    m_nthreads = static_cast<int>(std::min<size_t>(max_threads, m_harness_work_amount));
+
+    m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
+    OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size),
+                    "Undefined buffer scratchpad size!");
+    // Each thread gets its own scratchpad region of m_buffer_scratchpad_size.
+    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
+}
+
+// Builds the parallel execution domain from the master shape.
+// `domain` is overwritten with `tensor_rank` ones. The leading `master_shape.size() - tile_rank`
+// dimensions of `master_shape` are then copied in, starting at position
+// `tensor_rank - master_shape.size()`, so the trailing `tile_rank` positions stay equal to 1.
+// Throws (via OPENVINO_ASSERT) if `tile_rank` exceeds the master shape rank or the master shape
+// rank exceeds `tensor_rank`; both cases would otherwise underflow the unsigned offsets below.
+void SubgraphBaseExecutor::init_parallel_domain(const std::vector<size_t>& master_shape,
+                                                size_t tensor_rank,
+                                                size_t tile_rank,
+                                                std::vector<size_t>& domain) {
+    OPENVINO_ASSERT(tile_rank <= master_shape.size(), "Tile rank exceeds master shape rank!");
+    OPENVINO_ASSERT(master_shape.size() <= tensor_rank, "Master shape rank exceeds tensor rank!");
+
+    // assign() resizes and resets every element in one step.
+    domain.assign(tensor_rank, 1);
+    const size_t outer_dims = master_shape.size() - tile_rank;
+    const size_t leading_pad = tensor_rank - master_shape.size();
+    std::copy(master_shape.cbegin(), master_shape.cbegin() + outer_dims, domain.begin() + leading_pad);
+}
+
+// Convenience overload: reads the master shape, tensor rank and tile rank from the runtime config.
+void SubgraphBaseExecutor::init_parallel_domain(const std::shared_ptr<CPURuntimeConfig>& snippet_config,
+                                                std::vector<size_t>& domain) {
+    const auto& cfg = *snippet_config;
+    init_parallel_domain(cfg.master_shape, cfg.tensor_rank, cfg.tile_rank, domain);
+}
+// Parallel loop over the first five dimensions of the execution domain.
+// Each thread calls `initializer` once for its own call args, takes its [begin, end) share of the
+// work from splitter(), and then calls `caller` once per work item with the current index tuple.
+void SubgraphBaseExecutor::parallel_for6d(
+    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
+    const auto& domain = m_parallel_exec_domain;
+
+    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
+        jit_snippets_call_args thread_args;
+        initializer(thread_args, ithr);
+
+        size_t work_begin = 0;
+        size_t work_end = 0;
+        splitter(m_harness_work_amount, nthr, ithr, work_begin, work_end);
+
+        // Five indexes, one per iterated dimension.
+        std::vector<size_t> idx(5, 0);
+        // Position the index tuple at this thread's first work item.
+        parallel_it_init(work_begin,
+                         idx[0], domain[0],
+                         idx[1], domain[1],
+                         idx[2], domain[2],
+                         idx[3], domain[3],
+                         idx[4], domain[4]);
+        for (size_t work_item = work_begin; work_item < work_end; ++work_item) {
+            caller(thread_args, idx, ithr);
+            // Advance the index tuple to the next work item.
+            parallel_it_step(idx[0], domain[0],
+                             idx[1], domain[1],
+                             idx[2], domain[2],
+                             idx[3], domain[3],
+                             idx[4], domain[4]);
+        }
+    });
+}
+
+// Generic parallel loop over all but the last dimension of the execution domain.
+// Each thread calls `initializer` once for its own call args, takes its [begin, end) share of the
+// work from splitter(), and decomposes every linear work index into per-dimension indexes before
+// calling `caller`.
+void SubgraphBaseExecutor::parallel_forNd(
+    const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+    const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&, size_t)>& caller) {
+    const auto& domain = m_parallel_exec_domain;
+
+    parallel_nt_static(m_nthreads, [&](const int ithr, const int nthr) {
+        jit_snippets_call_args thread_args;
+        initializer(thread_args, ithr);
+
+        size_t work_begin = 0;
+        size_t work_end = 0;
+        splitter(m_harness_work_amount, nthr, ithr, work_begin, work_end);
+
+        std::vector<size_t> idx(domain.size() - 1, 0);
+        for (size_t work_item = work_begin; work_item < work_end; ++work_item) {
+            // Peel off the indexes from the innermost iterated dimension outwards.
+            size_t remainder = work_item;
+            for (size_t dim = idx.size(); dim-- > 0;) {
+                idx[dim] = remainder % domain[dim];
+                remainder /= domain[dim];
+            }
+
+            caller(thread_args, idx, ithr);
+        }
+    });
+}
+
+// Public entry point: delegates to the exec_impl() of the concrete executor.
+void SubgraphBaseExecutor::execute(const dnnl::stream& strm,
+                                   const std::vector<MemoryPtr>& inMemPtrs,
+                                   const std::vector<MemoryPtr>& outMemPtrs) {
+    // The stream argument is not consulted by this implementation.
+    static_cast<void>(strm);
+    exec_impl(inMemPtrs, outMemPtrs);
+}
+
+}  // namespace intel_cpu
+}  // namespace ov