diff --git a/src/plugins/intel_cpu/src/allocation_context.hpp b/src/plugins/intel_cpu/src/allocation_context.hpp new file mode 100644 index 00000000000000..8affe814807004 --- /dev/null +++ b/src/plugins/intel_cpu/src/allocation_context.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace ov { +namespace intel_cpu { + +class Node; +class Edge; + +using GlobalExecutionIndex = std::unordered_map, std::pair>; + +struct AllocationContext { + std::vector> edges; + GlobalExecutionIndex execIndex; + std::vector syncPoints; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index e4d1662f96a308..ddfb26d8d42421 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -10,9 +10,11 @@ #include "async_infer_request.h" #include "config.h" #include "cpu/x64/cpu_isa_traits.hpp" +#include "graph.h" #include "infer_request.h" #include "itt.h" #include "low_precision/low_precision.hpp" +#include "memory_control.hpp" #include "memory_state.h" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" @@ -163,15 +165,16 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const { std::lock_guard lock{*m_mutex.get()}; auto isQuantizedFlag = (m_cfg.lpTransformsMode == Config::On) && ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model); - ctx = std::make_shared(m_cfg, m_socketWeights[socketId], isQuantizedFlag, streamsExecutor, m_sub_memory_manager); } + const std::shared_ptr model = m_model; - graphLock._graph.CreateGraph(model, ctx); + graphLock._graph.Init(model, ctx); + graphLock._graph.Activate(); } catch (...) 
{ exception = std::current_exception(); } @@ -355,7 +358,7 @@ void CompiledModel::release_memory() { "Attempt to call release_memory() on a compiled model in a busy state. Please ensure that all " "infer requests are completed before releasing memory."); auto ctx = graph.getGraphContext(); - ctx->getNetworkMemoryControl()->releaseMemory(); + ctx->releaseMemory(); } } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index 1f9cc3c0fdb590..8e13928a8ad328 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -13,7 +14,6 @@ #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/iplugin.hpp" #include "openvino/runtime/isync_infer_request.hpp" -#include "openvino/runtime/threading/thread_local.hpp" #include "sub_memory_manager.hpp" namespace ov { diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index f7578f65ce3bbb..1c8fb0919103f0 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -272,7 +272,7 @@ Edge::ReorderStatus Edge::needReorder() { } void Edge::reuse(MemoryPtr ptr) { - OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse initialized memory in ", *this); + OPENVINO_ASSERT(ptr != nullptr, "Attempt to reuse uninitialized memory in ", *this); memoryPtr = std::move(ptr); changeStatus(Status::Allocated); @@ -461,13 +461,18 @@ const MemoryDesc& Edge::getOutputDesc() const { return *memDescPtr; } -const MemoryDesc& Edge::getDesc() const { +const MemoryDesc& Edge::getOriginalDesc() const { + OPENVINO_ASSERT(!one_of(status, Status::Validated, Status::Allocated), + "Desc of an Allocated edge ", + *this, + " must be accessed through the memory object"); + if (getInputDesc().getPrecision() == element::undefined) { return getInputDesc(); } if (!getInputDesc().isCompatible(getOutputDesc())) { - OPENVINO_THROW("Cannot get 
descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); + OPENVINO_THROW("Cannot get descriptor for edge: ", *this); } return getInputDesc(); @@ -498,7 +503,7 @@ void Edge::validate() { getChild(); if (status != Status::Allocated || !memoryPtr) { - OPENVINO_THROW("Error memory is not allocated!"); + OPENVINO_THROW("Error memory is not allocated for edge: ", *this); } status = Status::Validated; } diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 2f05bf1c92282d..cdcb4616bd8f15 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -26,7 +26,13 @@ class Edge { public: Edge(const std::shared_ptr& parent, const std::shared_ptr& child, int pr_port = 0, int ch_port = 0); - enum class Status { Uninitialized, NeedAllocation, NotAllocated, Allocated, Validated }; + enum class Status { + Uninitialized, // base edge is unknown yet + NeedAllocation, // edge is the base edge + NotAllocated, // edge references another edge + Allocated, // edge memory is allocated + Validated // edge is validated + }; enum class ReorderStatus { Regular = 0, Optimized = 1, No = 2 }; @@ -84,10 +90,11 @@ class Edge { EdgePtr getSharedEdge(std::nothrow_t) const; bool hasDefinedMaxSize() const { - return getDesc().hasDefinedMaxSize(); + return getOriginalDesc().hasDefinedMaxSize(); } std::string hash() const; + const MemoryDesc& getOriginalDesc() const; private: std::weak_ptr parent; @@ -105,7 +112,6 @@ class Edge { PortDescBaseCPtr getInputPortDesc() const; PortDescBaseCPtr getOutputPortDesc() const; - const MemoryDesc& getDesc() const; bool enforceReorder(); void collectConsumers(std::vector>& result) const; diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 9dee4424e7a925..6c02870fd2185e 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include #include @@ -17,12 
+19,16 @@ #include #include +#include "allocation_context.hpp" #include "common/primitive_desc_iface.hpp" +#include "cpu_types.h" #include "edge.h" +#include "graph_context.h" #include "graph_dumper.h" #include "graph_optimizer.h" #include "infer_request.h" #include "itt.h" +#include "memory_control.hpp" #include "memory_desc/cpu_memory_desc_utils.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "node.h" @@ -32,6 +38,7 @@ #include "nodes/input.h" #include "nodes/memory.hpp" #include "nodes/reorder.h" +#include "nodes/tensoriterator.h" #include "openvino/core/except.hpp" #include "openvino/core/model.hpp" #include "openvino/core/node.hpp" @@ -68,10 +75,10 @@ void Graph::CreateGraph(NET& model, const GraphContext::CPtr& context) { Activate(); } -void Graph::CreateGraph(const std::vector& graphNodes, - const std::vector& graphEdges, - const GraphContext::CPtr& context, - std::string name) { +void Graph::Init(const std::vector& graphNodes, + const std::vector& graphEdges, + const GraphContext::CPtr& context, + std::string name) { if (IsReady()) { ForgetGraphData(); } @@ -97,6 +104,13 @@ void Graph::CreateGraph(const std::vector& graphNodes, } Configure(); +} + +void Graph::CreateGraph(const std::vector& graphNodes, + const std::vector& graphEdges, + const GraphContext::CPtr& context, + std::string name) { + Init(graphNodes, graphEdges, context, std::move(name)); Activate(); } @@ -294,8 +308,8 @@ static std::tuple, std::vector> ExtractExecutableNo if (!node->isConstant() && // constants are executed once in scope of compile_model !staticZeroDims && // never execute static nodes with zero dim input / output tensors - (CPU_DEBUG_CAPS_ALWAYS_TRUE(node->isExecutable()) || // execute all executable nodes - dynamicNonInputOutput)) { // plus dynamic ones, except inputs / outputs + (CPU_DEBUG_CAPS_ALWAYS_TRUE(!node->neverExecute()) || // execute all executable nodes + dynamicNonInputOutput)) { // plus dynamic ones, except inputs / outputs graphIdToExecutableId[i] = 
executableGraphNodes.size(); executableGraphNodes.emplace_back(node); } @@ -336,43 +350,12 @@ void Graph::Init(const std::shared_ptr& model, Configure(); } -static void UseExternalInputMemory(const std::map& inputNodesMap, - const std::vector& memory) { - for (size_t i = 0; i < memory.size(); i++) { - const auto& node = inputNodesMap.at(i); - - auto childEdges = node->getChildEdgesAtPort(0); - for (const auto& childEdge : childEdges) { - OPENVINO_ASSERT(childEdge->getStatus() == Edge::Status::Uninitialized, "Unexpected edge status"); - - childEdge->reuse(memory[i]); - } - } -} - -static void UseExternalOutputMemory(const std::map& outputNodesMap, - const std::vector& memory) { - for (size_t i = 0; i < memory.size(); i++) { - const auto& node = outputNodesMap.at(i); - - const auto& parentEdge = node->getParentEdgeAt(0); - OPENVINO_ASSERT(parentEdge->getStatus() == Edge::Status::Uninitialized, "Unexpected edge status"); - - parentEdge->reuse(memory[i]); - } -} - -void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { - OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); - - const bool hasDynNodes = ProcessDynNodes(); - const auto syncNodesInds = hasDynNodes ? 
IdentifySyncPoints(graphNodes) : std::vector{}; - - UseExternalInputMemory(inputNodesMap, externalInputMemory); - UseExternalOutputMemory(outputNodesMap, externalOutputMemory); - - Allocate(syncNodesInds); +void Graph::Activate() { + // @todo It is possible that execution graph is already created in scope of + // the allocation context collection from the outer graph so the state for inner graph is "Ready" + // We probably want to avoid such uncertainty + // OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status: ", static_cast(status)); + Allocate(); CreatePrimitivesAndExecConstants(); @@ -382,23 +365,6 @@ void Graph::Activate(const std::vector& externalInputMemory, } #endif - - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = - ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - - if (hasDynNodes) { - status = Status::ReadyDynamic; - // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec - // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context - // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also - // this rule works for short graphs (usually subgraphs) when the amount of nodes is to low to process them in - // parallel. 
- const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); - if (exec2sync < 10 || parallel_get_max_threads() < 2) { - status = Status::ReadyDynamicSeq; - } - } else { - status = Status::ReadyStatic; - } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } @@ -743,98 +709,280 @@ void Graph::ResolveComplexInplaceConflicts() { } } -static inline bool isConstOutput(const EdgePtr& edge) { - return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); -} +/** + * Partition the \clusters of Edges, by moving to the end and allocating at the same time + * the clusters that cannot be handled as part of the generic memory solver algorithm. + * Such clusters meet one of the following criteria: + * - base edge of a cluster is already Allocated + * - base edge of a cluster is a "ov::element::string" type of edge + * - base edge of a cluster is a Constant edge + * + * @return a remaining number of clusters to process (left partition) + */ +static size_t AllocateStringsAndConstants(EdgeClusters& clusters, const GraphContext::CPtr& context) { + auto allocateConstantEdge = [&context](const EdgePtr& edge) { + if (edge->getParent()->getType() == Type::Input) { + auto constNode = std::static_pointer_cast(edge->getParent()); + edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); + } else { + edge->externalAllocate(context->getWeightsCache()); + } + }; -void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { - edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges); + auto allocateStringMemory = [&context](const EdgePtr& edge) { + auto memory = std::make_shared(context->getEngine(), edge->getOriginalDesc()); + edge->reuse(memory); + return memory->getStringMemoryBlockPtr(); + }; - size_t remaining_edge_clusters_count = edge_clusters.size(); + auto notAllocatedPartitionEnd = std::partition( + clusters.begin(), + clusters.end(), + [&allocateStringMemory, &allocateConstantEdge, &context](const EdgeCluster& cluster) { 
+ if (cluster.empty()) { + return false; + } - // Resolve special cases: - for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto& cluster = edge_clusters[i]; - bool erase = false; - for (auto& edge : cluster) { - // Remove already allocated edges from the mem reuse algo - if (edge->getStatus() == Edge::Status::Allocated) { - erase = true; - break; + auto baseEdgeIt = std::find_if(cluster.begin(), cluster.end(), [](const EdgePtr& edge) { + return one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::NeedAllocation); + }); + + OPENVINO_ASSERT(baseEdgeIt != cluster.end(), "Unexpected cluster state"); + + const auto& baseEdge = *baseEdgeIt; + if (baseEdge->getStatus() == Edge::Status::Allocated) { + return false; } - // Special allocation for string tensors - if (edge->getDesc().getPrecision() == element::string && - edge->getStatus() == Edge::Status::NeedAllocation) { - StringMemory::StringMemoryBlockPtr memBlcok; - if (edge->getParent()->isConstant()) { - if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); + // Allocate a cluster of the constants + if (baseEdge->getParent()->isConstant()) { + allocateConstantEdge(baseEdge); + return false; + } + + // Allocate a non-constant string cluster + if (baseEdge->getOriginalDesc().getPrecision() == element::string) { + OPENVINO_ASSERT(std::all_of(cluster.begin(), + cluster.end(), + [](const EdgePtr& edge) { + return edge->getOriginalDesc().getPrecision() == element::string; + }), + "All edges in the string cluster must be strings."); + auto memBlock = allocateStringMemory(baseEdge); + for (auto& edge : cluster) { + if (edge->getStatus() == Edge::Status::NotAllocated) { + edge->reuse( + std::make_shared(context->getEngine(), edge->getOriginalDesc(), memBlock)); } - auto stringMemory = 
dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, - "[CPU] Edge between nodes '", - edge->getParent()->getName(), - "' and '", - edge->getChild()->getName(), - "' must have StringMemory."); - memBlcok = stringMemory->getStringMemoryBlockPtr(); - } else { - auto memory = std::make_shared(getEngine(), edge->getDesc()); - edge->reuse(memory); - memBlcok = memory->getStringMemoryBlockPtr(); } - for (auto& edge_c : cluster) { - if (edge_c == edge) { - continue; - } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, - "All edges in the cluster must be string."); - if (edge_c->getStatus() == Edge::Status::NotAllocated) { - auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); - edge_c->reuse(memory); - } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", - edge_c->getParent()->getName(), - "' and '", - edge_c->getChild()->getName(), - "' has an unexpected status: ", - static_cast(edge_c->getStatus())); - } + return false; + } + + return true; + }); + + return std::distance(clusters.begin(), notAllocatedPartitionEnd); +} + +static void AllocateBaseEdges(const EdgeClusters& edgeClusters, const MemoryControl::MemorySolution& memorySolution) { + // attach all the not yet allocated edges to the memory control + for (auto&& item : memorySolution) { + int count = 0; + for (auto&& edge : edgeClusters[item.first]) { + if (edge->getStatus() == Edge::Status::NeedAllocation) { + edge->allocate(item.second); + // TODO: WA for some test (like strided_slice_test) which use tensors with + // shapes {0}. And it is implicitly converted into {1} tensor. + // Zeroing of input data allow pass tests. + if (edge->getParent()->getType() == Type::Input && edge->getMemory().getDesc().hasDefinedMaxSize()) { + edge->getMemoryPtr()->nullify(); } - erase = true; - continue; + + count++; } + } + OPENVINO_ASSERT(count == 1, "Expected exactly one allocation. 
Actual number of allocations: ", count); + } +} - // Special allocation for constants - if (edge->getStatus() != Edge::Status::NeedAllocation || !edge->getParent()->isConstant()) { +static void AllocatedReferencingEdges(const EdgeClusters& clusters) { + for (auto& cluster : clusters) { + for (auto& edge : cluster) { + if (edge->getStatus() != Edge::Status::NotAllocated) { continue; } - if (edge->getParent()->getType() == Type::Input) { - auto constNode = std::static_pointer_cast(edge->getParent()); - edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); - } else { - edge->externalAllocate(m_context->getWeightsCache()); + + std::vector edges_to_process; + edges_to_process.push_back(edge); + for (auto next_edge = edge->getSharedEdge(std::nothrow); next_edge; + next_edge = next_edge->getSharedEdge(std::nothrow)) { + edges_to_process.push_back(next_edge); } - erase = true; + + std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { + if (edge->getStatus() == Edge::Status::NotAllocated) { + if (edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); + } else if (edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else { + auto sharedEdge = edge->getSharedEdge(); + auto sharedEdgeParent = sharedEdge->getParent(); + edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); + DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge); + } + } + }); } + } +} - if (erase) { - std::swap(edge_clusters[i], edge_clusters[remaining_edge_clusters_count - 1]); - --remaining_edge_clusters_count; - } else { - ++i; +std::vector Graph::CreateExecutionGraph() { + const bool hasDynNodes = ProcessDynNodes(); + auto syncNodesInds = hasDynNodes ? 
IdentifySyncPoints(graphNodes) : std::vector{}; + + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of sync nodes is less than 10 times of the number of exec + // nodes, it does make sense to use Sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also + // this rule works for short graphs (usually subgraphs) when the amount of nodes is too low to process them in + // parallel. + const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; } + } else { + status = Status::ReadyStatic; + } + + return syncNodesInds; +} + +static void ResolveInOutInPlaceEdges(const std::vector& edges) { + for (const auto& edge : edges) { + if (edge->getStatus() == Edge::Status::Uninitialized) { + if (edge->getParent()->getParentEdges().empty() && + one_of(edge->getParent()->getType(), Type::MemoryInput) && edge->inPlace(Edge::LOOK_UP)) { + edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); + } else if (edge->getChild()->getChildEdges().empty() && + one_of(edge->getChild()->getType(), Type::MemoryOutput) && edge->inPlace(Edge::LOOK_DOWN)) { + edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); + } + } + } +} + +int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) { + auto syncNodesInds = CreateExecutionGraph(); + + ResolveInOutInPlaceEdges(graphEdges); + + // nodes are expected to be topologically sorted + for (size_t execIndex = 0, syncNodeIdx = 0; execIndex < graphNodes.size(); execIndex++) { + const auto& node = graphNodes[execIndex]; + const auto inputExecIndex = offset; + // register local sync node idx to global 
allocation context as well + if (syncNodeIdx < syncNodesInds.size() && syncNodesInds[syncNodeIdx] == execIndex) { + context.syncPoints.push_back(inputExecIndex); + syncNodeIdx++; + } + + // an offset is the number of nodes in the internal graph minus the current node (-1) + offset = node->registerToAllocationContext(inputExecIndex, context); + const auto outputExecIndex = offset; + offset++; + context.execIndex[node] = {inputExecIndex, outputExecIndex}; + } + + context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end()); + + return offset - 1; +} + +static void InitEdgeStatus(const std::vector& edges) { + for (auto& edge : edges) { + edge->init(); + } +} + +static void ValidateEdgeStatus(const std::vector& edges) { + for (auto& edge : edges) { + edge->validate(); + } +} + +/** + * Forms clusters of edges. + * An edge cluster is a collection of edges, with the following properties: + * - base edge is an edge with a Memory which other edges point to by means of inplace logic + * - first edge of a cluster is a base edge with a status either NeedAllocation or Allocated + * - rest of the edges in a cluster are NotAllocated ones, since they point to another edge + */ +static EdgeClusters FormEdgeClusters(const std::vector& graphEdges) { + using EdgeClusterIdxMap = std::unordered_map; + EdgeClusters edgeClusters; + EdgeClusterIdxMap edgeClusterIndices; + + for (auto& edge : graphEdges) { + if (edgeClusterIndices.count(edge)) { + continue; // edge is visited + } + + size_t clusterIdx = edgeClusters.size(); + EdgePtr lastSharedEdge = nullptr; + + // find cluster index + for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; + shared_edge = shared_edge->getSharedEdge(std::nothrow)) { + auto shared_edge_it = edgeClusterIndices.find(shared_edge); + if (shared_edge_it != edgeClusterIndices.end()) { + clusterIdx = shared_edge_it->second; + lastSharedEdge = shared_edge; + break; + } + } + + if (clusterIdx == edgeClusters.size()) { + 
edgeClusters.emplace_back(EdgeCluster{edge}); + } + + // use recursive approach to ensure that the base edge is placed as a first entry of a cluster + std::function addToCluster; + addToCluster = + [&addToCluster, &edgeClusterIndices, &clusterIdx, &edgeClusters, &lastSharedEdge](const EdgePtr& edge) { + if (edge == lastSharedEdge) { + return; + } + + addToCluster(edge->getSharedEdge(std::nothrow)); + + if (edgeClusterIndices.emplace(edge, clusterIdx).second) { + edgeClusters[clusterIdx].push_back(edge); + } + }; + + addToCluster(edge); } + return edgeClusters; +} + +static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters, + size_t remaining, + const GlobalExecutionIndex& globalExecIndex) { + auto isConstOutput = [](const EdgePtr& edge) { + return edge->getParent()->isConstant() && !edge->getChild()->isConstant(); + }; + // Markup the memory regions - std::vector memoryRegions; - memoryRegions.reserve(remaining_edge_clusters_count); + MemoryRegions memoryRegions; + memoryRegions.reserve(remaining); - for (size_t i = 0; i < remaining_edge_clusters_count; ++i) { + for (size_t i = 0; i < remaining; ++i) { MemoryRegion reg = {std::numeric_limits::max(), 0, 0, @@ -844,11 +992,26 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { int64_t boxSize = 0; bool isConst = false, isOutput = false, isInput = false; - for (auto& edge : edge_clusters[i]) { - int e_start = edge->getParent()->getExecIndex(); - int e_finish = edge->getChild()->getExecIndex(); - auto&& desc = edge->getDesc(); + for (auto& edge : clusters[i]) { + const auto& parent = edge->getParent(); + const auto& child = edge->getChild(); + + auto usesInOutMemoryMultipleTimes = [](const NodePtr& node) { + if (auto tensorIterator = std::dynamic_pointer_cast(node)) { + return tensorIterator->usesInOutMemoryMultipleTimes(); + } + + return false; + }; + // If node uses its input / output memory multiple times in scope of a single execution (i.e TensorIterator) + // prolong the lifetime of a 
memory region till execution is finished + int e_start = usesInOutMemoryMultipleTimes(parent) ? globalExecIndex.at(parent).first + : globalExecIndex.at(parent).second; + int e_finish = usesInOutMemoryMultipleTimes(child) ? globalExecIndex.at(child).second + : globalExecIndex.at(child).first; + + auto&& desc = edge->getOriginalDesc(); if (boxSize != -1 && desc.isDefined()) { int64_t e_size = @@ -870,8 +1033,8 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { reg.alloc_type = allocType; isConst |= isConstOutput(edge); - isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; + isOutput |= child->getType() == Type::Output; + isInput |= parent->getType() == Type::Input; } reg.size = boxSize; @@ -891,128 +1054,107 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { memoryRegions.push_back(reg); } - // special processing of the dynamic output edges - auto it = std::remove_if(memoryRegions.begin(), memoryRegions.end(), [&](const MemoryRegion& region) { - if (region.size >= 0 || !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { - return false; - } - bool result = false; - for (auto& edge : edge_clusters[region.id]) { - auto child = edge->getChild(); - if (child->getType() == Type::Output && edge->getStatus() == Edge::Status::NeedAllocation) { - auto proxyMemBlock = std::make_shared(); - DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock, " ", this); - edge->allocate(proxyMemBlock); - - // Store the output memory blocks. - // So that, the infer requests can be able to access them. - int count = 0; - for (auto& output : outputNodesMap) { - if (output.second == child) { - outputNodesMemBlocksMap[output.first] = proxyMemBlock; - count++; - } - } - // sometimes there are unused output ports. - OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. 
count ", count); - result = true; - } - } - return result; - }); + return memoryRegions; +} - memoryRegions.erase(it, memoryRegions.end()); +static Graph::OutputMemoryBlocks FilterOutDynamicOutputEdges(MemoryRegions& memoryRegions, + const EdgeClusters& clusters, + const std::map& outputNodes) { + Graph::OutputMemoryBlocks outputMemBlocks; + memoryRegions.erase( + std::remove_if(memoryRegions.begin(), + memoryRegions.end(), + [&](const MemoryRegion& region) { + if (region.size >= 0 || + !one_of(region.type, MemoryRegion::RegionType::OUTPUT, MemoryRegion::RegionType::IO)) { + return false; + } + bool result = false; + for (auto& edge : clusters[region.id]) { + auto child = edge->getChild(); + if (child->getType() == Type::Output && + edge->getStatus() == Edge::Status::NeedAllocation) { + auto proxyMemBlock = std::make_shared(); + DEBUG_LOG("ProxyMemoryBlock ", proxyMemBlock); + + edge->allocate(proxyMemBlock); + + // Store the output memory blocks. + // So that, the infer requests can be able to access them. + // @todo Can we just get them from outputNodesMap instead? + int count = 0; + for (auto& output : outputNodes) { + if (output.second == child) { + outputMemBlocks[output.first] = proxyMemBlock; + count++; + } + } + // sometimes there are unused output ports. + OPENVINO_ASSERT(count <= 1, "CPU plugin cannot find output node. count ", count); + result = true; + } + } + return result; + }), + memoryRegions.end()); + + return outputMemBlocks; +} - // Set up the memory control subsystem. 
- this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); - auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); +/** + * Solve memory reuse + * Ideally only MemorySolution should be returned + * For now we have to additionally return: + * 1) EdgeClusters - to propagate the solution through the graph + * 2) OutputMemoryBlocks - to allow memory sharing between graph and infer request + */ +static std::tuple SolveMemoryReuse( + const std::shared_ptr& memoryControl, + const AllocationContext& allocationContext, + const GraphContext::CPtr& graphContext, + const std::map& outputNodesMap) { + const auto& edges = allocationContext.edges; - // attach all the not yet allocated edges to the memory contol - for (auto&& item : memoryBlocks) { - int count = 0; - for (auto&& edge : edge_clusters[item.first]) { - if (edge->getStatus() == Edge::Status::NeedAllocation) { - edge->allocate(item.second); + auto edgeClusters = FormEdgeClusters(edges); - // TODO: WA for some test (like strided_slice_test) which use tensors with - // shapes {0}. And it is implicitly converted into {1} tensor. - // Zeroing of input data allow pass tests. 
- if (edge->getParent()->type == Type::Input && edge->hasDefinedMaxSize()) { - edge->getMemoryPtr()->nullify(); - } + const size_t remainingEdgeClustersCount = AllocateStringsAndConstants(edgeClusters, graphContext); - count++; - } - } - OPENVINO_ASSERT(count == 1); - } + auto memoryRegions = FormMemoryRegions(edgeClusters, remainingEdgeClustersCount, allocationContext.execIndex); - m_pMemoryControl->allocateMemory(); + auto outputNodesMemBlocks = FilterOutDynamicOutputEdges(memoryRegions, edgeClusters, outputNodesMap); - // Resolve all other edges with status NotAllocated and in-place - for (auto& cluster : edge_clusters) { - for (auto& edge : cluster) { - if (edge->getStatus() != Edge::Status::NotAllocated) { - continue; - } - std::vector edges_to_process; - edges_to_process.push_back(edge); - for (auto next_edge = edge->getSharedEdge(std::nothrow); next_edge; - next_edge = next_edge->getSharedEdge(std::nothrow)) { - edges_to_process.push_back(next_edge); - } - std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { - if (edge->getStatus() == Edge::Status::NotAllocated) { - if (edge->inPlace(Edge::LOOK_DOWN)) { - edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); - } else if (edge->inPlace(Edge::LOOK_UP)) { - edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); - } else { - auto sharedEdge = edge->getSharedEdge(); - auto sharedEdgeParent = sharedEdge->getParent(); - edge->allocate(sharedEdge->getMemoryPtr()->getMemoryBlock()); - DEBUG_LOG(*edge, " sharedEdge with ", *sharedEdge); - } - } - }); - } - } + memoryControl->insert(memoryRegions, allocationContext.syncPoints); + auto memoryBlocks = memoryControl->solve(); + + return std::make_tuple(memoryBlocks, edgeClusters, outputNodesMemBlocks); } -void Graph::Allocate(const std::vector& syncNodesInds) { - OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); +void Graph::Allocate() { + auto memoryControl = m_context->getMemoryControl(); - // resolve 
inplace dead end nodes - for (const auto& edge : graphEdges) { - if (edge->getStatus() == Edge::Status::Uninitialized) { - if (edge->getParent()->getParentEdges().empty() && - one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && edge->inPlace(Edge::LOOK_UP)) { - edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); - } else if (edge->getChild()->getChildEdges().empty() && - one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && - edge->inPlace(Edge::LOOK_DOWN)) { - edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); - } - } + if (memoryControl->allocated()) { + return; // memory is already allocated globally } - // resolve edges. Define which will be a view on others - // NeedAllocation - real blob - // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) { - edge->init(); - } + AllocationContext allocationContext; + RegisterToAllocationContext(0, allocationContext); - // Allocate memory space for all edges marked with NeedAllocation - AllocateWithReuse(syncNodesInds); + const auto& edges = allocationContext.edges; + InitEdgeStatus(edges); - // Check all getters. Should work. - for (auto& edge : graphEdges) { - edge->validate(); - } + auto [solution, edgeClusters, outputNodesMemBlocks] = + SolveMemoryReuse(memoryControl, allocationContext, m_context, outputNodesMap); + m_outputNodesMemBlocks = std::move(outputNodesMemBlocks); // bindings are locals: store the member explicitly + AllocateBaseEdges(edgeClusters, solution); + + memoryControl->allocateMemory(); + + AllocatedReferencingEdges(edgeClusters); + + ValidateEdgeStatus(edges); } -bool Graph::ProcessDynNodes() { +bool Graph::ProcessDynNodes() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ProcessDynNodes"); const bool containsDynamicNodes = std::any_of(graphNodes.begin(), graphNodes.end(), [](const NodePtr& node) { @@ -1484,13 +1626,7 @@ void Graph::Infer(SyncInferRequest* request) { DEBUG_LOG("Infer graph: ", GetName(), ". 
Status: ", static_cast(status)); const int numaId = GetNumaNodeId(m_context); - if (!m_pMemoryControl) { - OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName()); - } - - if (!m_pMemoryControl->allocated()) { - m_pMemoryControl->allocateMemory(); - } + m_context->allocateMemory(); switch (status) { case Status::ReadyDynamic: @@ -1780,8 +1916,8 @@ NodePtr Graph::InsertReorder(const EdgePtr& edge, // Due to the specificity of GraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do // these checks. if (!isOptimized) { - reorder->getParentEdgeAt(0)->getDesc(); - reorder->getChildEdgeAt(0)->getDesc(); + reorder->getParentEdgeAt(0)->getOriginalDesc(); + reorder->getChildEdgeAt(0)->getOriginalDesc(); } return reorder; diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index b28d2983104682..fffaa66ea1a609 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -9,6 +9,7 @@ #include #include +#include "allocation_context.hpp" #include "config.h" #include "cpu_memory.h" #include "edge.h" @@ -17,7 +18,6 @@ #include "memory_state.h" #include "node.h" #include "nodes/input.h" -#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" #include "openvino/runtime/so_ptr.hpp" #include "proxy_mem_blk.h" @@ -32,7 +32,8 @@ class MemoryStateNode; class Graph { public: - typedef std::shared_ptr Ptr; + using Ptr = std::shared_ptr; + using OutputMemoryBlocks = std::unordered_map; enum class Status { NotReady = 0, @@ -64,9 +65,23 @@ class Graph { return m_context->getConfig(); } + /** + * Obsolete way of creating graph + * To enable layout propagation and global memory reuse + * two-stage creation should be used instead: + * - Init() + * - Activate() + */ template void CreateGraph(NET& model, const GraphContext::CPtr& context); + /** + * Obsolete way of creating graph + * To enable layout propagation and global memory reuse + * two-stage 
creation should be used instead: + * - Init() + * - Activate() + */ void CreateGraph(const std::vector& graphNodes, const std::vector& graphEdges, const GraphContext::CPtr& context, @@ -225,6 +240,11 @@ class Graph { return graphHasDynamicInput; } + void Init(const std::vector& graphNodes, + const std::vector& graphEdges, + const GraphContext::CPtr& context, + std::string name); + /** * Init graph using \p model, \p context, \p inputConfigs and \p outputConfigs */ @@ -234,13 +254,31 @@ class Graph { const std::vector& outputConfigs = {}); /** - * Activate execution graph using \p externalInputMemory and \p externalOutputMemory + * Activate execution graph */ - void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + void Activate(); - const std::unordered_map& getOutputNodesMemBlocksMap() { - return outputNodesMemBlocksMap; + /** + * Register the graph in the global allocation context by transforming + * local execution data into the global one: + * 1) Local execution indices are transformed into global ones, represented by input and output execution index + * where output execution index is an index of the last node of the inner graph + * 2) Local sync node indices are transformed into global ones using global input execution index + * 3) Local edges are added to the global list of edges + * + * Example graph with subgraphs: + * 0 -> 1 -> 2 -> 3 [0 -> 1 -> 2] -> 4 [0 -> 1] -> 5 + * + * Virtually flatten: + * 0(0) -> 1(1) -> 2(2) -> 3(5) [3 -> 4 -> 5] -> 6(7) [6 -> 7] -> 8 + * + * This is basically an equivalent to the actually flatten graph: + * 0 -> 1 -> 2 -> [3 -> 4 -> 5] -> [6 -> 7] -> 8 + */ + int RegisterToAllocationContext(int offset, AllocationContext& context); + + const std::unordered_map& getOutputNodesMemBlocksMap() const { + return m_outputNodesMemBlocks; } protected: @@ -271,6 +309,7 @@ class Graph { const std::vector& outputConfigs = {}); void Configure(bool optimize = true); + void Allocate(); 
void InitNodes(); void InitDescriptors(); @@ -278,10 +317,10 @@ class Graph { void InitOptimalPrimitiveDescriptors(); void ResolveEdgeConflicts(); void ResolveComplexInplaceConflicts(); - bool ProcessDynNodes(); - void Allocate(const std::vector& syncNodesInds); - void AllocateWithReuse(const std::vector& syncNodesInds); + bool ProcessDynNodes() const; + void AllocateWithReuse(const std::vector& syncNodesInds, GlobalExecutionIndex globalExecIndex); void CreatePrimitivesAndExecConstants() const; + std::vector CreateExecutionGraph(); /** * Execute a given \p node within \p request using \p numaId @@ -322,7 +361,7 @@ class Graph { std::map inputNodesMap; std::map outputNodesMap; - std::unordered_map outputNodesMemBlocksMap; + OutputMemoryBlocks m_outputNodesMemBlocks; // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of @@ -332,8 +371,6 @@ class Graph { GraphContext::CPtr m_context; dnnl::stream m_stream; - - MemoryControl* m_pMemoryControl = nullptr; }; using GraphPtr = std::shared_ptr; diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index fcdbda48142165..d2a292c1b4909d 100644 --- a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -25,7 +25,8 @@ GraphContext::GraphContext(Config config, m_subMemoryManager(std::move(sub_memory_manager)), m_numNumaNodes(1), m_memoryStatesRegister(std::make_shared()), - m_networkMemoryControl(std::make_shared()) { + m_auxiliaryNetworkMemoryControl(std::make_shared()), + m_memoryControl(m_auxiliaryNetworkMemoryControl->createMemoryControlUnit()) { if (m_streamExecutor) { m_cpuStreamExecutor = std::dynamic_pointer_cast(m_streamExecutor); m_numaNodeId = m_cpuStreamExecutor ? 
m_cpuStreamExecutor->get_numa_node_id() : 0; diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index 8389bc389505fe..8e02fa6b03183b 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -7,6 +7,7 @@ #include "cache/multi_cache.h" #include "config.h" #include "dnnl_scratch_pad.h" +#include "memory_control.hpp" #include "openvino/runtime/threading/cpu_streams_executor.hpp" #include "sub_memory_manager.hpp" #include "weights_cache.hpp" @@ -18,6 +19,7 @@ namespace node { class MemoryStatesRegister; } // namespace node +class MemoryControl; class NetworkMemoryControl; class GraphContext { @@ -73,33 +75,55 @@ class GraphContext { return m_memoryStatesRegister; } - const std::shared_ptr& getNetworkMemoryControl() const { - return m_networkMemoryControl; + const std::shared_ptr& getMemoryControl() const { + return m_memoryControl; } -private: - Config m_config; // network-level config - - WeightsSharing::Ptr m_weightsCache; // per NUMA node caches for sharing weights data - - MultiCachePtr m_rtParamsCache; // primitive cache - DnnlScratchPadPtr m_rtScratchPad; // scratch pad - - bool m_isGraphQuantizedFlag = false; + const std::shared_ptr& getAuxiliaryNetworkMemoryControl() const { + return m_auxiliaryNetworkMemoryControl; + } - std::vector m_rtScratchPads; // scratch pad (each sub-stream has its own copy) + void releaseMemory() const { + m_auxiliaryNetworkMemoryControl->releaseMemory(); + } - ov::threading::IStreamsExecutor::Ptr m_streamExecutor; // stream executor for current graph + void allocateMemory() const { + for (const auto& controlUnit : m_auxiliaryNetworkMemoryControl->controlUnits()) { + if (!controlUnit->allocated()) { + controlUnit->allocateMemory(); + } + } + } - ov::threading::CPUStreamsExecutor::Ptr m_cpuStreamExecutor; // cpu stream executor for current graph +private: + // model-level config + Config m_config; + // per NUMA node caches for sharing weights 
data + WeightsSharing::Ptr m_weightsCache; + // primitive cache + MultiCachePtr m_rtParamsCache; + // global scratch pad + DnnlScratchPadPtr m_rtScratchPad; + bool m_isGraphQuantizedFlag = false; + // scratch pad per sub-stream + std::vector m_rtScratchPads; + // stream executor for current graph + ov::threading::IStreamsExecutor::Ptr m_streamExecutor; + // cpu stream executor for current graph + ov::threading::CPUStreamsExecutor::Ptr m_cpuStreamExecutor; + // numa submemory manager std::shared_ptr m_subMemoryManager; int m_numNumaNodes = 1; int m_numaNodeId = 0; std::shared_ptr m_memoryStatesRegister; - std::shared_ptr m_networkMemoryControl; + // auxiliary object to allow creating additional memory control objects if the main one cannot be used + // i.e. fallback graph for dynamic in-place + std::shared_ptr m_auxiliaryNetworkMemoryControl; + // main memory control object, which is supposed to be globally reused + MemoryControl::Ptr m_memoryControl; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 42a453c5b4d760..e8782bb32cdad4 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -4,11 +4,13 @@ #include "memory_control.hpp" +#include +#include #include #include -#include "node.h" #include "openvino/runtime/memory_solver.hpp" +#include "utils/general_utils.h" namespace ov { namespace intel_cpu { @@ -86,8 +88,8 @@ class MemoryBlockWithRelease : public IMemoryBlockObserver { class IMemoryManager { public: virtual ~IMemoryManager() = default; - virtual void insert(const MemoryRegion& reg) = 0; - virtual const MemoryControl::MemoryBlockMap& lastSolution() = 0; + virtual void insert(const MemoryRegion& reg, const std::vector& syncInds) = 0; + virtual const MemoryControl::MemorySolution& lastSolution() = 0; virtual void allocate() = 0; virtual void release() = 0; }; @@ -101,11 +103,12 @@ std::shared_ptr 
makeDnnlMemoryBlock(Args&&... args) { class MemoryManagerIO : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void)syncInds; m_blocks.insert({reg.id, makeDnnlMemoryBlock()}); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { return m_blocks; } @@ -117,16 +120,17 @@ class MemoryManagerIO : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; + MemoryControl::MemorySolution m_blocks; }; class MemoryManagerStatic : public IMemoryManager { public: - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { + (void)syncInds; m_boxes.emplace_back(MemorySolver::Box{reg.start, reg.finish, reg.size, reg.id}); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { if (!m_boxes.empty() && m_blocks.empty()) { solve(); } @@ -165,7 +169,7 @@ class MemoryManagerStatic : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; + MemoryControl::MemorySolution m_blocks; std::vector m_boxes; std::shared_ptr m_workspace; size_t m_totalSize = 0; @@ -173,18 +177,17 @@ class MemoryManagerStatic : public IMemoryManager { class MemoryManageNonOverlapingSets : public IMemoryManager { public: - MemoryManageNonOverlapingSets(std::vector syncInds) : m_syncInds(std::move(syncInds)) {} - void insert(const MemoryRegion& reg) override { + void insert(const MemoryRegion& reg, const std::vector& syncInds) override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { // We have to extend the lifespan of tensors that are crossing a sync point border in order to save // the intermediate computation results from possible loss due to the tensor resize - auto itr_upper = 
std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + auto itr_upper = std::upper_bound(syncInds.begin(), syncInds.end(), box.finish, [](int y, int x) { return y <= x; }); - auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); + auto itr_lower = std::lower_bound(syncInds.begin(), syncInds.end(), box.start); if (itr_lower != itr_upper) { // across sections - if (itr_upper == m_syncInds.end()) { + if (itr_upper == syncInds.end()) { box.finish = -1; } else { box.finish = *itr_upper; @@ -194,10 +197,10 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { m_boxes.emplace_back(box); } - const MemoryControl::MemoryBlockMap& lastSolution() override { + const MemoryControl::MemorySolution& lastSolution() override { if (!m_boxes.empty() && m_blocks.empty()) { solve(); - m_blocks = MemoryControl::MemoryBlockMap{m_internalBlocks.begin(), m_internalBlocks.end()}; + m_blocks = MemoryControl::MemorySolution{m_internalBlocks.begin(), m_internalBlocks.end()}; } return m_blocks; } @@ -243,11 +246,10 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { } private: - MemoryControl::MemoryBlockMap m_blocks; - std::unordered_map> + MemoryControl::MemorySolution m_blocks; + std::unordered_map> m_internalBlocks; std::vector m_boxes; - std::vector m_syncInds; }; } // namespace @@ -261,16 +263,16 @@ class MemoryControl::RegionHandler { : m_cond(std::move(cond)), m_memManager(std::move(memManager)) {} - bool insert(const MemoryRegion& reg) { + bool insert(const MemoryRegion& reg, const std::vector& syncInds) { if (!m_cond(reg)) { return false; } - m_memManager->insert(reg); + m_memManager->insert(reg, syncInds); return true; } - const MemoryControl::MemoryBlockMap& lastSolution() const { + const MemoryControl::MemorySolution& lastSolution() const { return m_memManager->lastSolution(); } @@ -297,10 +299,8 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... 
args) { } // namespace -MemoryControl::MemoryControl(std::vector syncInds) { +MemoryControl::MemoryControl() { // init handlers - - // handler for dynamic tensors m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type || MemoryRegion::AllocType::POD != reg.alloc_type) { @@ -310,15 +310,13 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); // handler for static tensors - m_handlers.emplace_back(buildHandler( - [](const MemoryRegion& reg) { - if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || - MemoryRegion::AllocType::POD != reg.alloc_type) { - return false; - } - return true; - }, - std::move(syncInds))); + m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { + if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || + MemoryRegion::AllocType::POD != reg.alloc_type) { + return false; + } + return true; + })); // handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { @@ -329,22 +327,23 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); } -void MemoryControl::insert(const MemoryRegion& region) { +void MemoryControl::insert(const MemoryRegion& region, const std::vector& syncInds) { for (auto&& handler : m_handlers) { - if (handler->insert(region)) { + if (handler->insert(region, syncInds)) { return; } } OPENVINO_THROW("No suitable hanlder was found for the given memory region"); } -MemoryControl::MemoryBlockMap MemoryControl::insert(const std::vector& regions) { +void MemoryControl::insert(const std::vector& regions, const std::vector& syncInds) { for (auto&& region : regions) { - insert(region); + insert(region, syncInds); } +} - MemoryControl::MemoryBlockMap blocksMap; - blocksMap.reserve(regions.size()); +MemoryControl::MemorySolution MemoryControl::solve() { + MemoryControl::MemorySolution blocksMap; for (auto&& handler : m_handlers) { auto&& solution 
= handler->lastSolution(); @@ -371,54 +370,9 @@ void MemoryControl::releaseMemory() { m_allocated = false; } -edgeClusters MemoryControl::findEdgeClusters(const std::vector& graphEdges) { - typedef std::unordered_map edge_cluster_idx_map_t; - - edgeClusters edge_clusters; - edge_cluster_idx_map_t edge_cluster_indices; - - for (auto& edge : graphEdges) { - auto edge_it = edge_cluster_indices.find(edge); - if (edge_it != edge_cluster_indices.end()) { - continue; // edge is visited - } - - size_t cluster_idx = edge_clusters.size(); - EdgePtr last_shared_edge = nullptr; - - // find cluster index - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - auto shared_edge_it = edge_cluster_indices.find(shared_edge); - if (shared_edge_it != edge_cluster_indices.end()) { - cluster_idx = shared_edge_it->second; - last_shared_edge = shared_edge; - break; - } - } - - // add shared edges to cluster - edge_cluster_indices.emplace(edge, cluster_idx); - - if (cluster_idx == edge_clusters.size()) { - edge_clusters.emplace_back(edgeCluster{edge}); - } else { - edge_clusters[cluster_idx].emplace(edge); - } - - for (auto shared_edge = edge->getSharedEdge(std::nothrow); shared_edge != last_shared_edge; - shared_edge = shared_edge->getSharedEdge(std::nothrow)) { - edge_cluster_indices.emplace(shared_edge, cluster_idx); - edge_clusters[cluster_idx].emplace(shared_edge); - } - } - - return edge_clusters; -} - -MemoryControl& NetworkMemoryControl::createMemoryControlUnit(std::vector syncInds) { - m_controlUnits.emplace_back(std::unique_ptr(new MemoryControl(std::move(syncInds)))); - return *(m_controlUnits.back()); +MemoryControl::Ptr NetworkMemoryControl::createMemoryControlUnit() { + m_controlUnits.emplace_back(std::shared_ptr(new MemoryControl())); + return m_controlUnits.back(); } void NetworkMemoryControl::allocateMemory() { diff --git a/src/plugins/intel_cpu/src/memory_control.hpp 
b/src/plugins/intel_cpu/src/memory_control.hpp index b057d0afbbcf37..a792a225e16d22 100644 --- a/src/plugins/intel_cpu/src/memory_control.hpp +++ b/src/plugins/intel_cpu/src/memory_control.hpp @@ -9,8 +9,8 @@ namespace ov { namespace intel_cpu { -using edgeCluster = std::unordered_set; -using edgeClusters = std::vector; +using EdgeCluster = std::vector; +using EdgeClusters = std::vector; struct MemoryRegion { int start; // Execution order index of first use. @@ -22,17 +22,21 @@ struct MemoryRegion { enum class AllocType : uint8_t { POD, STRING, UNKNOWN } alloc_type; }; +using MemoryRegions = std::vector; + class MemoryControl { public: class RegionHandler; using RegionHandlerPtr = std::shared_ptr; - using MemoryBlockMap = std::unordered_map; + using MemorySolution = std::unordered_map; + using Ptr = std::shared_ptr; + using CPtr = std::shared_ptr; public: - static edgeClusters findEdgeClusters(const std::vector& graphEdges); + void insert(const MemoryRegions& regions, const std::vector& syncInds); - MemoryBlockMap insert(const std::vector& regions); + MemorySolution solve(); bool allocated() const { return m_allocated; @@ -42,13 +46,12 @@ class MemoryControl { void releaseMemory(); private: - explicit MemoryControl(std::vector syncInds); - void insert(const MemoryRegion& region); + MemoryControl(); + void insert(const MemoryRegion& region, const std::vector& syncInds); friend class NetworkMemoryControl; private: - std::vector m_syncInds; std::vector m_handlers; bool m_allocated = false; }; @@ -56,17 +59,19 @@ class MemoryControl { class NetworkMemoryControl { public: NetworkMemoryControl() = default; - MemoryControl& createMemoryControlUnit(std::vector syncInds); + + MemoryControl::Ptr createMemoryControlUnit(); void allocateMemory(); void releaseMemory(); -private: - using value_type = std::unique_ptr; + const std::vector& controlUnits() const { + return m_controlUnits; + } private: - std::vector m_controlUnits; + std::vector m_controlUnits; }; } // namespace 
intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 791d036fbeaff7..2614b6af85db5d 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1815,7 +1815,7 @@ bool Node::isOutputTensorAtPortEmpty(size_t port) const { return outputShapes[port].hasZeroDims(); } auto&& mem = getChildEdgeAt(port)->getMemory(); - if (mem.isDefined()) { + if (mem.isDefined() && !mem.getDesc().empty()) { return mem.getShape().hasZeroDims(); } return false; diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 60b6568562ec5c..6ac03b91b39934 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -15,6 +15,7 @@ #include #include +#include "allocation_context.hpp" #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" @@ -130,6 +131,40 @@ class NodeDesc { executorFactory = std::move(factory); } + bool hasZeroInputDims() const { + const auto& inputConfigs = getConfig().inConfs; + return std::any_of(inputConfigs.begin(), inputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroInputDimsAtPort(size_t portIdx) const { + const auto& inputConfigs = getConfig().inConfs; + OPENVINO_ASSERT(portIdx < inputConfigs.size(), + "Attempt to get NodeDesc input configuration for port ", + portIdx, + ". 
Number of inputs is ", + inputConfigs.size()); + return inputConfigs[portIdx].hasZeroDims(); + } + + bool hasZeroOutputDims() const { + const auto& outputConfigs = getConfig().outConfs; + return std::any_of(outputConfigs.begin(), outputConfigs.end(), [](const PortConfig& portConfig) { + return portConfig.hasZeroDims(); + }); + } + + bool hasZeroOutputDimsAtPort(size_t portIdx) const { + const auto& outputConfigs = getConfig().outConfs; + OPENVINO_ASSERT(portIdx < outputConfigs.size(), + "Attempt to get NodeDesc output configuration for port ", + portIdx, + ". Number of outputs is ", + outputConfigs.size()); + return outputConfigs[portIdx].hasZeroDims(); + } + private: NodeConfig config; impl_desc_type implementationType; @@ -292,6 +327,9 @@ class Node { bool isInPlace() const; + virtual bool neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDims(); + } // must be called only after Graph::ResolveEdgeConflicts() virtual bool isExecutable() const { return !hasEmptyInputTensors(); @@ -515,6 +553,17 @@ class Node { return execIndex; } + /** + * @brief Register node to the allocation \context + * + * The main use case are nodes with nested graphs. 
+ * Use this method to make nested graphs a part of global allocation procedure + */ + virtual int registerToAllocationContext(int offset, AllocationContext& context) { + (void)context; // nothing to register by default + return offset; + } + const std::string& getTypeStr() const { return typeStr; } diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.h b/src/plugins/intel_cpu/src/nodes/batch_to_space.h index 4d13ad27789a5d..a11358d320c5e7 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.h @@ -17,6 +17,11 @@ class BatchToSpace : public Node { void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; + bool neverExecute() const override { + const auto& spd = getSelectedPrimitiveDescriptor(); + return spd->hasZeroInputDims() || spd->hasZeroOutputDims(); + } + // output shape can potentially be empty bool isExecutable() const override { return !hasEmptyInputTensors() && !hasEmptyOutputTensors(); diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index b7e1b19f09f3c7..9d5fc09e2b44fb 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -189,6 +189,10 @@ bool Broadcast::needShapeInfer() const { return false; } +bool Broadcast::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Broadcast::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index c6063ebd89fbf4..d9fe039cafba50 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -24,6 +24,7 @@ class Broadcast : public Node, public TileBroadcastCommon { void executeDynamicImpl(const dnnl::stream& strm) override; bool created() const override; + bool neverExecute() const override; bool 
isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index 8e2db6f9192d21..8ab671cf554894 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -221,6 +221,10 @@ void Bucketize::prepareParams() { std::multiplies()); } +bool Bucketize::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Bucketize::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.h b/src/plugins/intel_cpu/src/nodes/bucketize.h index 3481941aa0a405..a5c0cd57dc8007 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.h +++ b/src/plugins/intel_cpu/src/nodes/bucketize.h @@ -24,6 +24,7 @@ class Bucketize : public Node { void prepareParams() override; + bool neverExecute() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index 488275e0ac564c..aa033a2ab5508a 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -43,16 +43,15 @@ void Composite::selectOptimalPrimitiveDescriptor() { std::vector inConfs; std::vector graphInputConfig; + constexpr bool isInPlace = true; + for (size_t i = 0; i < getParentEdges().size(); i++) { auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); inConfs.emplace_back(desc); - graphInputConfig.emplace_back(node::Input::InputConfig{std::move(desc), true}); + graphInputConfig.emplace_back(node::Input::InputConfig{std::move(desc), isInPlace}); } - std::vector graphOutputConfig; - for (size_t i = 0; i < outputShapes.size(); i++) { - 
graphOutputConfig.emplace_back(node::Input::OutputConfig{true, true}); - } + std::vector graphOutputConfig(outputShapes.size(), node::Input::OutputConfig{true, isInPlace}); // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); @@ -75,23 +74,37 @@ void Composite::selectOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void Composite::createPrimitive() { + m_graph.Activate(); +} + +int Composite::registerToAllocationContext(int offset, AllocationContext& context) { OPENVINO_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - inputMemory.emplace_back(getSrcMemoryAtPort(i)); + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.getInputNodeByIndex(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + // report the offending inner-graph edge itself, the same way the output-edge check below does + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + *inputEdge); + inputEdge->sharedMemFrom(parentEdge); + } } OPENVINO_ASSERT(getOriginalOutputsNumber() == m_graph.outputsNumber(), "Number of node outputs must be equal the number of inner graph's outputs"); - std::vector outputMemory; for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - outputMemory.emplace_back(getDstMemoryAtPort(i)); + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.getOutputNodeByIndex(i)->getParentEdgeAt(0); + OPENVINO_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + *outputEdge); + outputEdge->sharedMemFrom(childEdge); } - m_graph.Activate(inputMemory, outputMemory); + return m_graph.RegisterToAllocationContext(offset, context); } void Composite::execute(const dnnl::stream&) { diff --git 
a/src/plugins/intel_cpu/src/nodes/composite.h b/src/plugins/intel_cpu/src/nodes/composite.h index 59fc3705497a18..c3a123da43e74a 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.h +++ b/src/plugins/intel_cpu/src/nodes/composite.h @@ -31,6 +31,10 @@ class Composite : public Node { return false; } + bool neverExecute() const override { + return false; + } + bool isExecutable() const override { return true; } @@ -41,6 +45,8 @@ class Composite : public Node { void execute(const dnnl::stream&) override; void executeDynamicImpl(const dnnl::stream& strm) override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + const Graph& graph() const { return m_graph; } diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 425052750f1c2f..2a6700fa3fc611 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -29,6 +29,10 @@ namespace { constexpr size_t channelAxis = 1lu; } +bool Concat::neverExecute() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool Concat::isExecutable() const { return !isInPlace() && !hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 6bbc215d80aad7..465df398c87793 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -29,6 +29,7 @@ class Concat : public Node { ov::element::Type getRuntimePrecision() const override; + bool neverExecute() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 70abb0aec191c9..b580927f862798 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -182,7 +182,15 @@ class Convolution::FusedSubgraph { 
std::vector nodes(nodesSet.begin(), nodesSet.end()); - _graph->CreateGraph(nodes, edges, context, "fused_subgraph"); + _graph->Init(nodes, edges, context, "fused_subgraph"); + } + + int RegisterToAllocationContext(int offset, AllocationContext& context) { + return _graph->RegisterToAllocationContext(offset, context); + } + + void Activate() const { + _graph->Activate(); } std::shared_ptr getInput(size_t idx) const { @@ -788,10 +796,6 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, attr.set_post_ops(ops); } -void Convolution::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getImplPriority(), true); -} - void Convolution::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) { return; @@ -904,6 +908,36 @@ void Convolution::initSupportedPrimitiveDescriptors() { } } +void Convolution::selectOptimalPrimitiveDescriptor() { + selectPreferPrimitiveDescriptor(getImplPriority(), true); + /* preemptively create a fallback subgraph to include it into global memory reuse + * pros: + * - less total memory usage when fallback is actually needed (by size of intermediate memory) + * - no runtime overhead of graph creation when fallback is needed for the first time + * cons: + * - more total memory usage when fallback is not needed (by size of a graph data structure itself) + */ + if (withSum && isDynamicNode()) { + subgraph = std::make_shared(fusedWith, *this, context); + } +} + +int Convolution::registerToAllocationContext(int offset, AllocationContext& context) { + if (subgraph) { + return subgraph->RegisterToAllocationContext(offset, context); + } + + return Node::registerToAllocationContext(offset, context); +} + +void Convolution::createPrimitive() { + if (subgraph) { + subgraph->Activate(); + } + + Node::createPrimitive(); +} + bool Convolution::created() const { return getType() == Type::Convolution; } @@ -1671,9 +1705,7 @@ void Convolution::redefineOutputMemory(const std::vector& newOutputS const auto& sumInpMem 
= getParentEdgeAt(sumPortNum)->getMemory(); if (newOutputShapes.front() != sumInpMem.getStaticDims()) { withSumBroadcast = true; - if (!subgraph) { - subgraph = std::make_shared(fusedWith, *this, context); - } + auto inp0 = subgraph->getInput(0); inp0->redefineOutputMemory(newOutputShapes); diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h index 80c98b2a7bca07..bb4e2499a74408 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.h +++ b/src/plugins/intel_cpu/src/nodes/conv.h @@ -24,6 +24,8 @@ class Convolution : public Node { void initDescriptor(const NodeConfig& config) override; void selectOptimalPrimitiveDescriptor() override; void initSupportedPrimitiveDescriptors() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + void createPrimitive() override; bool created() const override; bool canBeInPlace() const override { return false; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp index 4c84cb4ff5f83b..3340589e0d6a0c 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp @@ -161,6 +161,10 @@ void EmbeddingBagOffset::executeDynamicImpl(const dnnl::stream& strm) { execute(strm); } +bool EmbeddingBagOffset::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingBagOffset::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h index c9bef8a9e28eab..ab28ad91222125 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h @@ -20,6 +20,7 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { void execute(const dnnl::stream& strm) override; bool created() const 
override; + bool neverExecute() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp index 2becb7e2635020..d5e7c9b9374b68 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp @@ -126,6 +126,10 @@ void EmbeddingBagPacked::executeDynamicImpl(const dnnl::stream& strm) { execute(strm); } +bool EmbeddingBagPacked::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingBagPacked::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h index a979917f2570c5..d4c3064d8fe0f4 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h @@ -21,6 +21,7 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { bool created() const override; bool isExecutable() const override; + bool neverExecute() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; protected: diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp index c53fd25b038c5e..8e531865578c88 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp @@ -157,6 +157,10 @@ void EmbeddingSegmentsSum::executeDynamicImpl(const dnnl::stream& strm) { execute(strm); } +bool EmbeddingSegmentsSum::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool EmbeddingSegmentsSum::isExecutable() const { return 
!isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h index 4e172655ef5472..dc6c4ac0c70d13 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h @@ -20,6 +20,7 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { void execute(const dnnl::stream& strm) override; bool created() const override; + bool neverExecute() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 666616c9c5ab14..3977d7fd467820 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -949,6 +949,10 @@ bool Gather::created() const { return getType() == Type::Gather; } +bool Gather::neverExecute() const { + return isInPlace() || Node::neverExecute(); +} + bool Gather::isExecutable() const { return !isInPlace() && Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 9f780b822cd497..1a05f88638b399 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -25,6 +25,7 @@ class Gather : public Node { void createPrimitive() override; void execute(const dnnl::stream& strm) override; bool created() const override; + bool neverExecute() const override; bool isExecutable() const override; void resolveInPlaceEdges(Edge::LOOK look) override; diff --git a/src/plugins/intel_cpu/src/nodes/if.cpp b/src/plugins/intel_cpu/src/nodes/if.cpp index 3a41f6431ce835..81fb51ca7235ec 100644 --- a/src/plugins/intel_cpu/src/nodes/if.cpp +++ b/src/plugins/intel_cpu/src/nodes/if.cpp @@ -8,11 +8,11 @@ #include #include -#include "common/cpu_memcpy.h" #include 
"nodes/common/cpu_convert.h" +#include "nodes/node_config.h" +#include "openvino/core/except.hpp" #include "openvino/op/if.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" -#include "transformations/utils/utils.hpp" namespace ov { namespace intel_cpu { @@ -73,39 +73,74 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st If::If(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, InternalDynShapeInferFactory()), - ovOp(op) { + m_op(ov::as_type_ptr(op)) { + OPENVINO_ASSERT(m_op, "'If' operation is expected"); + std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } } -void If::getSupportedDescriptors() { - auto ifOp = ov::as_type_ptr(ovOp); +void If::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) { + return; + } + + m_thenGraph.Init(m_op->get_then_body(), context); + m_elseGraph.Init(m_op->get_else_body(), context); + + NodeConfig config; + config.inConfs.reserve(getParentEdges().size()); + config.outConfs.reserve(getChildEdges().size()); - const std::shared_ptr& thenBody = ifOp->get_then_body(); - const std::shared_ptr& elseBody = ifOp->get_else_body(); - subGraphThen.CreateGraph(thenBody, context); - subGraphElse.CreateGraph(elseBody, context); + for (size_t i = 0; i < inputShapes.size(); i++) { + PortConfig dataConf{}; + auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); + dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); + config.inConfs.emplace_back(dataConf); + } + + for (size_t i = 0; i < outputShapes.size(); i++) { + PortConfig dataConf{}; + auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); + dataConf.setMemDesc( + descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); + config.outConfs.push_back(dataConf); + } + + 
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); +} + +int If::registerToAllocationContext(int offset, AllocationContext& context) { + // take into account an offset of the both subgraphs + const int thenOffset = m_thenGraph.RegisterToAllocationContext(offset, context); + const int elseOffset = m_elseGraph.RegisterToAllocationContext(thenOffset, context); + return elseOffset; +} + +void If::createPrimitive() { + m_thenGraph.Activate(); + m_elseGraph.Activate(); - for (const auto& param : ifOp->get_then_body()->get_parameters()) { - if (auto inNode = subGraphThen.getInputNodeByIndex(ifOp->get_then_body()->get_parameter_index(param))) { + for (const auto& param : m_op->get_then_body()->get_parameters()) { + if (auto inNode = m_thenGraph.getInputNodeByIndex(m_op->get_then_body()->get_parameter_index(param))) { inputMemThen.push_back(getToMemories(inNode.get(), 0)); } else { THROW_CPU_NODE_ERR("Then body of node does not have input with name: ", param->get_friendly_name()); } } - for (const auto& param : ifOp->get_else_body()->get_parameters()) { - if (auto inNode = subGraphElse.getInputNodeByIndex(ifOp->get_else_body()->get_parameter_index(param))) { + for (const auto& param : m_op->get_else_body()->get_parameters()) { + if (auto inNode = m_elseGraph.getInputNodeByIndex(m_op->get_else_body()->get_parameter_index(param))) { inputMemElse.push_back(getToMemories(inNode.get(), 0)); } else { THROW_CPU_NODE_ERR("Else body of node does not have input with name: ", param->get_friendly_name()); } } - for (const auto& out : ifOp->get_then_body()->get_results()) { - if (auto outNode = subGraphThen.getOutputNodeByIndex(ifOp->get_then_body()->get_result_index(out))) { + for (const auto& out : m_op->get_then_body()->get_results()) { + if (auto outNode = m_thenGraph.getOutputNodeByIndex(m_op->get_then_body()->get_result_index(out))) { auto outMem = outNode->getSrcMemoryAtPort(0); outputMemThen.push_back(outMem); } else { @@ -113,8 +148,8 @@ void 
If::getSupportedDescriptors() { } } - for (const auto& out : ifOp->get_else_body()->get_results()) { - if (auto outNode = subGraphElse.getOutputNodeByIndex(ifOp->get_else_body()->get_result_index(out))) { + for (const auto& out : m_op->get_else_body()->get_results()) { + if (auto outNode = m_elseGraph.getOutputNodeByIndex(m_op->get_else_body()->get_result_index(out))) { auto outMem = outNode->getSrcMemoryAtPort(0); outputMemElse.push_back(outMem); } else { @@ -123,57 +158,28 @@ void If::getSupportedDescriptors() { } // Port map: outputs - for (const auto& desc : ifOp->get_output_descriptions(0)) { + for (const auto& desc : m_op->get_output_descriptions(0)) { auto body_output_idx = desc->m_body_value_index; thenOutputPortMap.emplace_back( PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } - for (const auto& desc : ifOp->get_output_descriptions(1)) { + for (const auto& desc : m_op->get_output_descriptions(1)) { auto body_output_idx = desc->m_body_value_index; elseOutputPortMap.emplace_back( PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } - for (const auto& desc : ifOp->get_input_descriptions(0)) { + for (const auto& desc : m_op->get_input_descriptions(0)) { auto body_input_index = desc->m_body_parameter_index; thenInputPortMap.emplace_back( PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } - for (const auto& desc : ifOp->get_input_descriptions(1)) { + for (const auto& desc : m_op->get_input_descriptions(1)) { auto body_input_index = desc->m_body_parameter_index; elseInputPortMap.emplace_back( PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } -} - -void If::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) { - return; - } - - NodeConfig config; - config.inConfs.reserve(getParentEdges().size()); - config.outConfs.reserve(getChildEdges().size()); - for (size_t i = 0; i < inputShapes.size(); i++) { - PortConfig dataConf{}; - auto 
descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); - config.inConfs.emplace_back(dataConf); - } - - for (size_t i = 0; i < outputShapes.size(); i++) { - PortConfig dataConf{}; - auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc( - descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); - config.outConfs.push_back(dataConf); - } - - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); -} - -void If::createPrimitive() { const auto& eng = getEngine(); prepareBeforeMappers(true, eng); prepareBeforeMappers(false, eng); @@ -240,13 +246,15 @@ void If::execute(const dnnl::stream& strm) { auto& beforeMappers = condition ? beforeThenMappers : beforeElseMappers; auto& afterMappers = condition ? afterThenMappers : afterElseMappers; - auto& subGraph = condition ? subGraphThen : subGraphElse; + auto& subGraph = condition ? 
m_thenGraph : m_elseGraph; for (auto& mapper : beforeMappers) { mapper->execute(strm); } + subGraph.ResetInferCount(); subGraph.Infer(); + for (auto& mapper : afterMappers) { mapper->execute(strm); } diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index 829734142722e1..2abdba15ac83ad 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -11,6 +11,8 @@ #include #include +#include "openvino/op/if.hpp" + namespace ov { namespace intel_cpu { namespace node { @@ -21,10 +23,15 @@ class If : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void initSupportedPrimitiveDescriptors() override; - void getSupportedDescriptors() override; + void getSupportedDescriptors() override {} + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; bool created() const override; + void execute(const dnnl::stream& strm) override; + bool neverExecute() const override { + return false; + } bool isExecutable() const override { return true; } @@ -65,8 +72,8 @@ class If : public Node { ptrdiff_t size; }; - Graph subGraphThen; - Graph subGraphElse; + Graph m_thenGraph; + Graph m_elseGraph; std::vector> inputMemThen, inputMemElse; std::deque outputMemThen, outputMemElse; @@ -75,7 +82,7 @@ class If : public Node { std::vector thenInputPortMap, thenOutputPortMap, elseInputPortMap, elseOutputPortMap; - const std::shared_ptr ovOp; + std::shared_ptr m_op; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 8b6b09a9ec4bd0..a09197e763b9f5 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -499,10 +499,12 @@ void Input::selectOptimalPrimitiveDescriptor() { // ignore previous configuration supportedPrimitiveDescriptors.clear(); + const int inPlacePort = m_isInPlace ? 
0 : -1; // and just use parent memory descriptor for Output node to avoid reorders insertion std::vector inConfs; for (size_t i = 0; i < getParentEdges().size(); i++) { - inConfs.push_back({PortConfig(getParentOutputMemDesc(getParentEdgeAt(i)), BlockedMemoryDesc::FULL_MASK, 0)}); + inConfs.push_back( + {PortConfig(getParentOutputMemDesc(getParentEdgeAt(0)), BlockedMemoryDesc::FULL_MASK, inPlacePort)}); } NodeConfig config(inConfs, {}); @@ -575,6 +577,38 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown); } +void Input::resolveInPlaceEdges(Edge::LOOK look) { + if (!m_isInPlace) { + return Node::resolveInPlaceEdges(look); + } + + if (look & Edge::LOOK_UP) { + auto edges = getChildEdgesAtPort(0); + for (const auto& edge : edges) { + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } + + if (look & Edge::LOOK_DOWN) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto edge = getParentEdgeAt(i); + EdgePtr sharedEdge = edge; + + while (sharedEdge->getSharedEdge(std::nothrow)) { + sharedEdge = sharedEdge->getSharedEdge(std::nothrow); + } + + edge->reuse(sharedEdge->getMemoryPtr()); + } + } +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index d0e1814b7a6878..b21e4ae080f81a 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -56,12 +56,17 @@ class Input : public Node { void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; bool created() const override; + void resolveInPlaceEdges(Edge::LOOK look) override; void withMeanImage(); MemoryCPtr getMemoryPtr() const; void execute(const dnnl::stream& strm) override {} void executeDynamicImpl(const 
dnnl::stream& strm) override {} + + bool neverExecute() const override { + return true; + } bool isExecutable() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.cpp b/src/plugins/intel_cpu/src/nodes/interaction.cpp index b4b100d5dff3da..a31b3e5bfb6f9f 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.cpp +++ b/src/plugins/intel_cpu/src/nodes/interaction.cpp @@ -361,6 +361,10 @@ void Interaction::executeDynamicImpl(const dnnl::stream& strm) { execute(strm); } +bool Interaction::neverExecute() const { + return false; +} + bool Interaction::isExecutable() const { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h index 405a59940076ba..34dc10bff6d147 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.h +++ b/src/plugins/intel_cpu/src/nodes/interaction.h @@ -50,6 +50,7 @@ class Interaction : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool neverExecute() const override; bool isExecutable() const override; void executeDynamicImpl(const dnnl::stream& strm) override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index c252142d461b08..ded7ada321f3a4 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -51,17 +51,19 @@ void LoRA::selectOptimalPrimitiveDescriptor() { auto mainInputPrc = mainInputDesc->getPrecision(); // we have to align precision across all the inputs inConfs.emplace_back(mainInputDesc); - graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, true}); + + constexpr bool isInPlace = true; + graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, isInPlace}); for (size_t i = 1; i < getParentEdges().size(); i++) { auto desc = getParentOutputMemDesc(getParentEdgeAt(i))->cloneWithNewPrecision(mainInputPrc); 
inConfs.emplace_back(desc); - graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, isInPlace}); } std::vector graphOutputConfig; // enforce the same memory descriptor on the output as on the input to allow inPlace memory - graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), true}); + graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), isInPlace}); // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); @@ -86,24 +88,45 @@ void LoRA::selectOptimalPrimitiveDescriptor() { selectPrimitiveDescriptorByIndex(0); } -// @todo add ascii diagram for memory mapping / reuse -void LoRA::createPrimitive() { +int LoRA::registerToAllocationContext(int offset, AllocationContext& context) { CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), "Number of node inputs must be equal the number of inner graph's inputs"); - std::vector inputMemory; for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - auto srcEdgeMem = getSrcMemoryAtPort(i); - auto mem = std::make_shared(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock()); - subgraphMemoryPtrs.push_back(mem); - inputMemory.emplace_back(std::move(mem)); + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = m_graph.getInputNodeByIndex(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized Edge instead of: ", + static_cast(inputEdge->getStatus())); + inputEdge->sharedMemFrom(parentEdge); + } } CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.outputsNumber(), "Number of node outputs must be equal the number of inner graph's outputs"); - std::vector outputMemory{getDstMemoryAtPort(0)}; - m_graph.Activate(inputMemory, outputMemory); + for 
(size_t i = 0; i < getOriginalOutputsNumber(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = m_graph.getOutputNodeByIndex(i)->getParentEdgeAt(0); + outputEdge->sharedMemFrom(childEdge); + } + + return m_graph.RegisterToAllocationContext(offset, context); +} + +void LoRA::createPrimitive() { + CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.inputsNumber(), + "Number of node inputs must be equal the number of inner graph's inputs"); + // Workaround to avoid making LoRa node always executable (isExecutable() = true) + // This way we update subgraph's input memory without performing an actual Infer() call + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + auto subgraphInputNode = m_graph.getInputNodeByIndex(i); + auto subgraphInputMemory = subgraphInputNode->getDstMemoryAtPort(0); + subgraphMemoryPtrs.emplace_back(subgraphInputMemory); + } + + m_graph.Activate(); } void LoRA::execute(const dnnl::stream&) { diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h index 3c993c20e3f91d..128dddbdd3af33 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.h +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -23,6 +23,7 @@ class LoRA : public Node { void getSupportedDescriptors() override{}; void selectOptimalPrimitiveDescriptor() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; void prepareParams() override; void execute(const dnnl::stream&) override; diff --git a/src/plugins/intel_cpu/src/nodes/matmul.cpp b/src/plugins/intel_cpu/src/nodes/matmul.cpp index d60ab73ad06667..7c1a3f9fdcf1b2 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.cpp +++ b/src/plugins/intel_cpu/src/nodes/matmul.cpp @@ -737,6 +737,10 @@ const std::vector& MatMul::getDefaultImplPriority() { return priorities; } +bool MatMul::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDims(); +} + bool MatMul::isExecutable() const { return 
!hasEmptyOutputTensors(); } diff --git a/src/plugins/intel_cpu/src/nodes/matmul.h b/src/plugins/intel_cpu/src/nodes/matmul.h index d1cff0ca8bcd2a..1ffd35807f0ffc 100644 --- a/src/plugins/intel_cpu/src/nodes/matmul.h +++ b/src/plugins/intel_cpu/src/nodes/matmul.h @@ -43,6 +43,7 @@ class MatMul : public Node { const std::vector& getDefaultImplPriority() override; bool canBeExecutedInInt8() const override; + bool neverExecute() const override; bool isExecutable() const override; protected: diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp index d6c285f8d77fac..e189f105c10cb1 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.cpp @@ -315,6 +315,10 @@ void MatrixNms::prepareParams() { } } +bool MatrixNms::neverExecute() const { + return !isDynamicNode() && Node::neverExecute(); +} + bool MatrixNms::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/matrix_nms.h b/src/plugins/intel_cpu/src/nodes/matrix_nms.h index 4071cd81d18ae4..88bac5d70064a7 100644 --- a/src/plugins/intel_cpu/src/nodes/matrix_nms.h +++ b/src/plugins/intel_cpu/src/nodes/matrix_nms.h @@ -29,6 +29,7 @@ class MatrixNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool neverExecute() const override; bool isExecutable() const override; void executeDynamicImpl(const dnnl::stream& strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index fb643197eb611c..d900a6afaa602e 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -235,6 +235,10 @@ void MemoryOutputBase::assignState(const MemStatePtr& newState) { assignExtMemory(state->output_mem(), state->internal_desc()); } +bool MemoryOutputBase::neverExecute() const { + return false; +} + bool 
MemoryOutputBase::isExecutable() const { return true; } @@ -500,6 +504,10 @@ void MemoryInputBase::deregisterSibling(MemoryOutputBase* node) { } } +bool MemoryInputBase::neverExecute() const { + return false; +} + bool MemoryInputBase::isExecutable() const { return true; } @@ -691,36 +699,54 @@ void MemoryInput::initOptimalPrimitiveDescriptor() { // @todo add ascii diagramm for memory mapping / reuse void MemoryInput::createPrimitive() { - MemoryInputBase::createPrimitive(); if (haveSubgraph()) { - OPENVINO_ASSERT(getOriginalInputsNumber() == subGraph->inputsNumber(), - "Number of node inputs must be equal the number of inner graph's inputs: ", - getOriginalInputsNumber(), - " != ", - subGraph->inputsNumber()); + CPU_NODE_ASSERT(getParentEdges().size() == subGraph->inputsNumber(), + "The number of node inputs must be equal to the number of inner graph's inputs"); - std::vector inputMemory; for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - auto srcEdgeMem = getSrcMemoryAtPort(i); - // create a separate input memory objects instead of share them. avoid data corruption. 
- auto mem = std::make_shared(getEngine(), srcEdgeMem->getDescPtr(), srcEdgeMem->getMemoryBlock()); - subgraphMemoryPtrs.push_back(mem); - inputMemory.emplace_back(std::move(mem)); + auto subgraphInputNode = subGraph->getInputNodeByIndex(i); + auto subgraphInputMemory = subgraphInputNode->getDstMemoryAtPort(0); + subgraphMemoryPtrs.push_back(subgraphInputMemory); } - OPENVINO_ASSERT(getOriginalOutputsNumber() == subGraph->outputsNumber(), - "Number of node outputs must be equal the number of inner graph's outputs: ", - getOriginalOutputsNumber(), - " != ", - subGraph->outputsNumber()); + subGraph->Activate(); + } - std::vector outputMemory; - for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - outputMemory.emplace_back(getDstMemoryAtPort(i)); + MemoryInputBase::createPrimitive(); +} + +int MemoryInput::registerToAllocationContext(int offset, AllocationContext& context) { + if (!haveSubgraph()) { + return Node::registerToAllocationContext(offset, context); + } + + CPU_NODE_ASSERT(getParentEdges().size() == subGraph->inputsNumber(), + "Number of node inputs must be equal the number of inner graph's inputs"); + + for (size_t i = 0; i < subGraph->inputsNumber(); i++) { + auto parentEdge = getParentEdgeAt(i); + auto inputEdges = subGraph->getInputNodeByIndex(i)->getChildEdgesAtPort(0); + for (const auto& inputEdge : inputEdges) { + CPU_NODE_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + *this); + inputEdge->sharedMemFrom(parentEdge); } + } + + CPU_NODE_ASSERT(subGraph->outputsNumber() <= getChildEdges().size(), + "Number of inner graph's outputs must be not greater than number of node outputs"); - subGraph->Activate(inputMemory, outputMemory); + for (size_t i = 0; i < subGraph->outputsNumber(); i++) { + auto childEdge = getChildEdgeAt(i); + auto outputEdge = subGraph->getOutputNodeByIndex(i)->getParentEdgeAt(0); + CPU_NODE_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected 
Uninitialized state for edge: ", + *outputEdge); + outputEdge->sharedMemFrom(childEdge); } + + return subGraph->RegisterToAllocationContext(offset, context); } void MemoryInput::runDynamic(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index 604d0d2c80bad2..b6f4063d4988a2 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -66,7 +66,9 @@ class MemoryOutputBase : public Node, public MemoryNode { void execute(const dnnl::stream& strm) override final; // NOLINT void executeDynamicImpl(const dnnl::stream& strm) override final; // NOLINT - bool isExecutable() const override final; // NOLINT + + bool isExecutable() const override final; // NOLINT + bool neverExecute() const override final; // NOLINT void registerInputNode(MemoryInputBase* node); void deregisterSibling(MemoryInputBase* node); @@ -149,6 +151,7 @@ class MemoryInputBase : public Input, public MemoryStateNode { bool needPrepareParams() const override { return false; } + bool neverExecute() const override final; // NOLINT bool isExecutable() const override final; // NOLINT void registerOutputNode(MemoryOutputBase* node); @@ -212,6 +215,8 @@ class MemoryInput : public MemoryInputBase { void resolveInPlaceEdges(Edge::LOOK look) override; + int registerToAllocationContext(int offset, AllocationContext& context) override; + void createPrimitive() override; MemStatePtr makeState() const override; diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp index e0d2f5b0ce058a..10438e6f99dcd3 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.cpp @@ -226,6 +226,10 @@ void MultiClassNms::prepareParams() { m_numBoxOffset.resize(m_numBatches); } +bool MultiClassNms::neverExecute() const { + return !isDynamicNode() && Node::neverExecute(); +} + bool 
MultiClassNms::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp index fdcdc9af26d611..8968f92011592a 100644 --- a/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp +++ b/src/plugins/intel_cpu/src/nodes/multiclass_nms.hpp @@ -27,6 +27,7 @@ class MultiClassNms : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool neverExecute() const override; bool isExecutable() const override; void executeDynamicImpl(const dnnl::stream& strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.cpp b/src/plugins/intel_cpu/src/nodes/multinomial.cpp index 7802a847f275ba..36e025d908e214 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.cpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.cpp @@ -116,6 +116,11 @@ void Multinomial::prepareParams() { m_batches_samples_probs_count = m_output_elements_count * m_probs_count; } +bool Multinomial::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(PROBS_PORT) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(NUM_SAMPLES_PORT); +} + bool Multinomial::isExecutable() const { return !isInputTensorAtPortEmpty(PROBS_PORT) && !isInputTensorAtPortEmpty(NUM_SAMPLES_PORT); } diff --git a/src/plugins/intel_cpu/src/nodes/multinomial.hpp b/src/plugins/intel_cpu/src/nodes/multinomial.hpp index b56fe08e870d93..fe8f1c2c70f69f 100644 --- a/src/plugins/intel_cpu/src/nodes/multinomial.hpp +++ b/src/plugins/intel_cpu/src/nodes/multinomial.hpp @@ -30,6 +30,7 @@ class Multinomial : public Node { void createPrimitive() override; + bool neverExecute() const override; bool isExecutable() const override; void execute(const dnnl::stream& strm) override; void executeDynamicImpl(const dnnl::stream& strm) override; diff --git a/src/plugins/intel_cpu/src/nodes/node_config.h 
b/src/plugins/intel_cpu/src/nodes/node_config.h index d09e2132c1b470..9c17fec9f203f3 100644 --- a/src/plugins/intel_cpu/src/nodes/node_config.h +++ b/src/plugins/intel_cpu/src/nodes/node_config.h @@ -139,6 +139,11 @@ class PortConfig { _desc = createPortDesc(desc, cmpMask); } + bool hasZeroDims() const { + const auto desc = getMemDesc(); + return desc->getShape().hasZeroDims() && !desc->empty(); + } + private: PortDescBasePtr createPortDesc(const MemoryDescPtr& desc, BlockedMemoryDesc::CmpMask cmpMask) { if (desc->getType() & Blocked) diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 4af902f26e4885..e66b7daa7b2e1a 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -968,6 +968,10 @@ void NonMaxSuppression::checkOutput(const Shape& shape, const std::string& name, } } +bool NonMaxSuppression::neverExecute() const { + return !isDynamicNode() && Node::neverExecute(); +} + bool NonMaxSuppression::isExecutable() const { return isDynamicNode() || Node::isExecutable(); } diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h index 8d7aa93b969ad7..caea270de9e768 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h @@ -48,6 +48,7 @@ class NonMaxSuppression : public Node { int suppress_begin_index; }; + bool neverExecute() const override; bool isExecutable() const override; bool needShapeInfer() const override { diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.h b/src/plugins/intel_cpu/src/nodes/non_zero.h index e9e2bef9fe294a..1685d9c24ccf63 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.h +++ b/src/plugins/intel_cpu/src/nodes/non_zero.h @@ -34,6 +34,9 @@ class NonZero : public Node { void executeDynamicImpl(const dnnl::stream& strm) override; static bool 
isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool neverExecute() const override { + return false; + } bool isExecutable() const override { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/normalize.cpp b/src/plugins/intel_cpu/src/nodes/normalize.cpp index 689eb757c90491..b4e3c44cc42f0a 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.cpp +++ b/src/plugins/intel_cpu/src/nodes/normalize.cpp @@ -962,6 +962,10 @@ void NormalizeL2::createPrimitive() { } } +bool NormalizeL2::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool NormalizeL2::isExecutable() const { return !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/normalize.h b/src/plugins/intel_cpu/src/nodes/normalize.h index fe6c6c83589fc1..e531e0c8a8851c 100644 --- a/src/plugins/intel_cpu/src/nodes/normalize.h +++ b/src/plugins/intel_cpu/src/nodes/normalize.h @@ -98,6 +98,7 @@ class NormalizeL2 : public Node { void prepareParams() override; void executeDynamicImpl(const dnnl::stream& strm) override; + bool neverExecute() const override; bool isExecutable() const override; enum class NormEpsMode { ADD, MAX }; diff --git a/src/plugins/intel_cpu/src/nodes/pad.cpp b/src/plugins/intel_cpu/src/nodes/pad.cpp index 530fb84f15a3f3..0f5b0adec6ca30 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.cpp +++ b/src/plugins/intel_cpu/src/nodes/pad.cpp @@ -210,6 +210,10 @@ void Pad::createPrimitive() { } } +bool Pad::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool Pad::isExecutable() const { return !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/pad.h b/src/plugins/intel_cpu/src/nodes/pad.h index 02915eff6a7d3b..f051dde01cd141 100644 --- a/src/plugins/intel_cpu/src/nodes/pad.h +++ b/src/plugins/intel_cpu/src/nodes/pad.h @@ -23,6 +23,7 @@ class Pad : public Node { void prepareParams() override; bool 
needShapeInfer() const override; + bool neverExecute() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.h b/src/plugins/intel_cpu/src/nodes/paged_attn.h index 8526b3b8dda999..8f41d811337d43 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.h +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.h @@ -22,10 +22,19 @@ class PagedAttention : public Node { bool created() const override { return getType() == Type::PagedAttention; } + + // pastkv may have zero dimension + bool neverExecute() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } + // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); } + bool needPrepareParams() const override { return false; } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp index bf2e8efa1c1cd9..eed6ed260ec80e 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.cpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.cpp @@ -236,6 +236,10 @@ bool RandomUniform::needShapeInfer() const { return !m_const_inputs[SHAPE]; } +bool RandomUniform::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(SHAPE); +} + bool RandomUniform::isExecutable() const { return !isInputTensorAtPortEmpty(SHAPE); } diff --git a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp index 7c0321b8183bfc..0e479682fd9388 100644 --- a/src/plugins/intel_cpu/src/nodes/random_uniform.hpp +++ b/src/plugins/intel_cpu/src/nodes/random_uniform.hpp @@ -41,6 +41,7 @@ class RandomUniform : public Node { void executeDynamicImpl(const 
dnnl::stream& strm) override; + bool neverExecute() const override; bool isExecutable() const override; void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index a86b89c099d5ec..6edefd701d5d73 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2237,6 +2237,10 @@ void Reduce::initSupportedPrimitiveDescriptors() { } } +bool Reduce::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool Reduce::isExecutable() const { return !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h index 16cf99bd9c75d4..8caa7a92b405a0 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.h +++ b/src/plugins/intel_cpu/src/nodes/reduce.h @@ -103,6 +103,7 @@ class Reduce : public Node { return false; } + bool neverExecute() const override; bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -206,4 +207,4 @@ class Reduce : public Node { } // namespace node } // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reference.h b/src/plugins/intel_cpu/src/nodes/reference.h index f0a37ae6529f5f..11eef0b6fa6364 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.h +++ b/src/plugins/intel_cpu/src/nodes/reference.h @@ -24,6 +24,9 @@ class Reference : public Node { bool needPrepareParams() const override { return false; } + bool neverExecute() const override { + return false; + } bool isExecutable() const override { return true; } diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 73fb1b0509f01f..6f4a052033f082 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -28,8 
+28,12 @@ namespace ov { namespace intel_cpu { namespace node { +bool Reorder::neverExecute() const { + return isOptimized || Node::neverExecute(); +} + bool Reorder::isExecutable() const { - return Node::isExecutable() && !isOptimized; + return !isOptimized && Node::isExecutable(); } Reorder::Reorder(const std::shared_ptr& op, const GraphContext::CPtr& context) diff --git a/src/plugins/intel_cpu/src/nodes/reorder.h b/src/plugins/intel_cpu/src/nodes/reorder.h index d04416157a0991..7f5e929939da5c 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.h +++ b/src/plugins/intel_cpu/src/nodes/reorder.h @@ -26,6 +26,7 @@ class Reorder : public Node { bool created() const override; const std::vector& getDefaultImplPriority() override; + bool neverExecute() const override; bool isExecutable() const override; void createPrimitive() override; diff --git a/src/plugins/intel_cpu/src/nodes/reshape.cpp b/src/plugins/intel_cpu/src/nodes/reshape.cpp index 69d4b1f3d9b3a2..6d7b33c9ebd724 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.cpp +++ b/src/plugins/intel_cpu/src/nodes/reshape.cpp @@ -142,7 +142,7 @@ void Reshape::execute(const dnnl::stream& strm) { } } -bool Reshape::isExecutable() const { +bool Reshape::neverExecute() const { bool inPlaceEnabled = false; if (auto prim_desc = getSelectedPrimitiveDescriptor()) { auto& config = prim_desc->getConfig(); @@ -150,7 +150,11 @@ bool Reshape::isExecutable() const { inPlaceEnabled = true; } } - return !inPlaceEnabled; + return inPlaceEnabled; +} + +bool Reshape::isExecutable() const { + return !neverExecute(); } bool Reshape::created() const { diff --git a/src/plugins/intel_cpu/src/nodes/reshape.h b/src/plugins/intel_cpu/src/nodes/reshape.h index 7758dfa6e06746..2f6f5ec9be4ff6 100644 --- a/src/plugins/intel_cpu/src/nodes/reshape.h +++ b/src/plugins/intel_cpu/src/nodes/reshape.h @@ -18,6 +18,7 @@ class Reshape : public Node { void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; bool created() 
const override; + bool neverExecute() const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.h b/src/plugins/intel_cpu/src/nodes/scaled_attn.h index aeabee681599b8..6efcd4ebcc5d1e 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.h +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.h @@ -21,6 +21,12 @@ class ScaledDotProductAttention : public Node { bool created() const override { return getType() == Type::ScaledDotProductAttention; } + + bool neverExecute() const override { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(1) || + getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(2); + } // pastkv may have zero dimension bool isExecutable() const override { return !isInputTensorAtPortEmpty(0) && !isInputTensorAtPortEmpty(1) && !isInputTensorAtPortEmpty(2); diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index d184b9cbbf4fad..e61f677ef80c54 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -70,6 +70,10 @@ bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& return true; } +bool ScatterUpdate::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(DATA_ID); +} + bool ScatterUpdate::isExecutable() const { return !isInputTensorAtPortEmpty(DATA_ID); } diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index df3827c2fa4f65..df45c49dc33cf0 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -89,6 +89,7 @@ class ScatterUpdate : public Node { bool needPrepareParams() const override; void executeDynamicImpl(const dnnl::stream& strm) override; + bool neverExecute() const override; 
bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.cpp b/src/plugins/intel_cpu/src/nodes/shapeof.cpp index 074a0e6c667f32..18c07bc6c7de1d 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.cpp +++ b/src/plugins/intel_cpu/src/nodes/shapeof.cpp @@ -82,10 +82,6 @@ void ShapeOf::initOptimalPrimitiveDescriptor() { selected_pd->setConfig(config); } -bool ShapeOf::isExecutable() const { - return true; -} - void ShapeOf::execute(const dnnl::stream& strm) { auto inPtr = getSrcMemoryAtPort(0); auto outPtr = getDstMemoryAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/shapeof.h b/src/plugins/intel_cpu/src/nodes/shapeof.h index e625af7bfb6a0c..5d1722b0ec6817 100644 --- a/src/plugins/intel_cpu/src/nodes/shapeof.h +++ b/src/plugins/intel_cpu/src/nodes/shapeof.h @@ -28,11 +28,17 @@ class ShapeOf : public Node { bool needPrepareParams() const override { return false; }; + void executeDynamicImpl(const dnnl::stream& strm) override { execute(strm); } - bool isExecutable() const override; + bool neverExecute() const override { + return false; + }; + bool isExecutable() const override { + return true; + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; }; diff --git a/src/plugins/intel_cpu/src/nodes/split.cpp b/src/plugins/intel_cpu/src/nodes/split.cpp index cc1560b4bb10cb..87a8421c554609 100644 --- a/src/plugins/intel_cpu/src/nodes/split.cpp +++ b/src/plugins/intel_cpu/src/nodes/split.cpp @@ -293,6 +293,10 @@ void Split::prepareParams() { } } +bool Split::neverExecute() const { + return isInPlace() || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Split::isExecutable() const { return !isInPlace() && !isInputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/split.h b/src/plugins/intel_cpu/src/nodes/split.h index a93e439d7fd5f6..2731a6f3a6062e 100644 --- 
a/src/plugins/intel_cpu/src/nodes/split.h +++ b/src/plugins/intel_cpu/src/nodes/split.h @@ -23,6 +23,7 @@ class Split : public Node { void initOptimalPrimitiveDescriptor() override; + bool neverExecute() const override; bool isExecutable() const override; bool needPrepareParams() const override; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index 15c99b91824bb6..3dfef6b9e50476 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -325,6 +325,11 @@ void StridedSlice::initSupportedPrimitiveDescriptors() { } } +bool StridedSlice::neverExecute() const { + return getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0) || + getSelectedPrimitiveDescriptor()->hasZeroOutputDimsAtPort(0); +} + bool StridedSlice::isExecutable() const { return !isInputTensorAtPortEmpty(0) && !isOutputTensorAtPortEmpty(0); } diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index ca755a62a7bdf5..d4776708a9703e 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -27,6 +27,7 @@ class StridedSlice : public Node { return false; } + bool neverExecute() const override; bool isExecutable() const override; bool needShapeInfer() const override; diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index 7543ade46514ed..ecd13c64952b48 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -436,29 +436,40 @@ TensorIterator::TensorIterator(const std::shared_ptr& op, const GraphC } } -void TensorIterator::getSupportedDescriptors() { - auto tiOp = ov::as_type_ptr(ngraphOp); - if (!tiOp) { - THROW_CPU_NODE_ERR("cannot be cast to ov::op::util::SubGraphOp"); +void TensorIterator::initSupportedPrimitiveDescriptors() { + 
auto subgraphOp = ov::as_type_ptr(ngraphOp); + CPU_NODE_ASSERT(subgraphOp, "cannot be cast to ov::op::util::SubGraphOp"); + + sub_graph.Init(subgraphOp->get_function(), context); + + if (!supportedPrimitiveDescriptors.empty()) { + return; } - const std::shared_ptr body = tiOp->get_function(); - sub_graph.CreateGraph(body, context); - for (const auto& param : tiOp->get_function()->get_parameters()) { - if (auto inNode = sub_graph.getInputNodeByIndex(tiOp->get_function()->get_parameter_index(param))) { + supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); +} + +void TensorIterator::createPrimitive() { + sub_graph.Activate(); + + auto subgraphOp = ov::as_type_ptr(ngraphOp); + CPU_NODE_ASSERT(subgraphOp, "cannot be cast to ov::op::util::SubGraphOp"); + + for (const auto& param : subgraphOp->get_function()->get_parameters()) { + if (auto inNode = sub_graph.getInputNodeByIndex(subgraphOp->get_function()->get_parameter_index(param))) { input_mems.push_back(getToMemories(inNode.get(), 0)); } } - for (const auto& out : tiOp->get_function()->get_results()) { - if (auto outNode = sub_graph.getOutputNodeByIndex(tiOp->get_function()->get_result_index(out))) { + for (const auto& out : subgraphOp->get_function()->get_results()) { + if (auto outNode = sub_graph.getOutputNodeByIndex(subgraphOp->get_function()->get_result_index(out))) { auto outMem = outNode->getSrcMemoryAtPort(0); output_mem.push_back(outMem); } } // Port map: outputs - for (const auto& desc : tiOp->get_output_descriptions()) { + for (const auto& desc : subgraphOp->get_output_descriptions()) { auto body_output_idx = desc->m_body_value_index; std::string type_name = desc->get_type_info().name; @@ -490,7 +501,7 @@ void TensorIterator::getSupportedDescriptors() { } // Port map : inputs and back edges - for (const auto& desc : tiOp->get_input_descriptions()) { + for (const auto& desc : subgraphOp->get_input_descriptions()) { auto body_input_index = 
desc->m_body_parameter_index; if (auto slice_desc = ov::as_type_ptr(desc)) { @@ -543,17 +554,7 @@ void TensorIterator::getSupportedDescriptors() { } else { THROW_CPU_NODE_ERR("isn't supported!"); } -} -void TensorIterator::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) { - return; - } - - supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); -} - -void TensorIterator::createPrimitive() { if (loopBodyConditionOutputIdx == -1) { continue_cond_check.reset(new staticValueCheck(true)); // always true } @@ -573,6 +574,10 @@ void TensorIterator::createPrimitive() { } } +int TensorIterator::registerToAllocationContext(int offset, AllocationContext& context) { + return sub_graph.RegisterToAllocationContext(offset, context); +} + bool TensorIterator::needPrepareParams() const { if (getAlgorithm() == Algorithm::TensorIteratorLoop) { const auto tripCountPtr = getSrcDataAtPortAs(loopTripCountIdx); diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.h b/src/plugins/intel_cpu/src/nodes/tensoriterator.h index 97399d28e788b3..247b88ab1bddf9 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.h +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.h @@ -113,13 +113,21 @@ class TensorIterator : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void initSupportedPrimitiveDescriptors() override; - void getSupportedDescriptors() override; + void getSupportedDescriptors() override{}; void createPrimitive() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; bool created() const override; void execute(const dnnl::stream& strm) override; + bool neverExecute() const override { + return false; + } bool isExecutable() const override { return true; } + // @todo limit to particular in / out ports + bool usesInOutMemoryMultipleTimes() { + return true; + } protected: // needShapeInfer() should return 
false diff --git a/src/plugins/intel_cpu/src/nodes/transpose.cpp b/src/plugins/intel_cpu/src/nodes/transpose.cpp index b17333011f6dd1..9ecc2ee50fdabc 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/transpose.cpp @@ -127,8 +127,12 @@ void Transpose::initSupportedPrimitiveDescriptors() { } } +bool Transpose::neverExecute() const { + return isOptimized || getSelectedPrimitiveDescriptor()->hasZeroInputDimsAtPort(0); +} + bool Transpose::isExecutable() const { - return !isInputTensorAtPortEmpty(0) && !isOptimized; + return !isOptimized && !isInputTensorAtPortEmpty(0); } bool Transpose::needPrepareParams() const { diff --git a/src/plugins/intel_cpu/src/nodes/transpose.h b/src/plugins/intel_cpu/src/nodes/transpose.h index c865e4918c28cd..34db4cd6e6d733 100644 --- a/src/plugins/intel_cpu/src/nodes/transpose.h +++ b/src/plugins/intel_cpu/src/nodes/transpose.h @@ -34,6 +34,7 @@ class Transpose : public Node { return order; } + bool neverExecute() const override; bool isExecutable() const override; bool needPrepareParams() const override; void prepareParams() override; diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 82342479aa9d77..a49669ab095466 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -7,6 +7,7 @@ #include "cpu_streams_calculation.hpp" #include "internal_properties.hpp" #include "itt.h" +#include "openvino/core/parallel.hpp" #include "openvino/op/paged_attention.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" #include "openvino/runtime/internal_properties.hpp" diff --git a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake index 6ce25c7bff55e4..52ccb94c7fa55f 100644 --- a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake +++ b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake @@ -2,13 +2,13 @@ # 
SPDX-License-Identifier: Apache-2.0 # -# create targed with prefix TARGET_PREFIX for each test file in directory TEST_DIR +# create target with prefix TARGET_PREFIX for each test file in directory TEST_DIR function(create_target_per_test_for_directory TEST_DIR TARGET_PREFIX) - # exclude every other test file inside directory +# exclude every other test file inside directory set(EXCLUDED_SOURCE_PATHS_FOR_TEST ${TEST_DIR}) - # list of object files required for each test +# list of object files required for each test set(REQUIRED_OBJECT_FILES ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/core_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/skip_tests_config.cpp @@ -96,7 +96,7 @@ endif() endfunction() if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST) - create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/common ov_cpu_func_subgraph) create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests ov_cpu_func_slt) endif() diff --git a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp index f520b0b53feae8..7ff89a029003d2 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/inplace_resolve_io.cpp @@ -5,24 +5,26 @@ #include "dummy_node.hpp" #include "graph.h" - -#include "nodes/input.h" +#include "memory_control.hpp" #include "nodes/concat.h" +#include "nodes/input.h" #include "nodes/rnn.h" - #include "openvino/op/concat.hpp" -#include "openvino/op/result.hpp" -#include "openvino/op/parameter.hpp" #include "openvino/op/gru_sequence.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" using namespace ov::intel_cpu; using namespace ov::op; // helper to check the inplace direction of a node with a part of its name -static void
CheckInplaceDirection(const std::shared_ptr graph, const std::string &partial_name, size_t inport, const Edge::LOOK undesiredDirection) { +static void CheckInplaceDirection(const std::shared_ptr graph, + const std::string& partial_name, + size_t inport, + const Edge::LOOK undesiredDirection) { const std::vector& graph_nodes = graph->GetNodes(); size_t actualCount = 0; - for (auto &node : graph_nodes) { + for (auto& node : graph_nodes) { if (node->getName().find(partial_name) != std::string::npos) { auto parentEdge = node->getParentEdgeAt(inport); if (undesiredDirection == 0) @@ -36,13 +38,14 @@ static void CheckInplaceDirection(const std::shared_ptr graph, const std: ASSERT_EQ(1, actualCount); } - class InplaceResolveIOCPUTestBase : public ::testing::Test { public: - std::shared_ptr create_graph(const std::vector& input_shapes, const size_t num_consumers = 1) { + std::shared_ptr create_graph(const std::vector& input_shapes, + const size_t num_consumers = 1) { Config conf; conf.rtCacheCapacity = 100; - const auto context = std::make_shared(conf, nullptr, false); + const auto context = + std::make_shared(conf, nullptr, false); std::shared_ptr graph = std::shared_ptr(new Graph()); @@ -82,7 +85,8 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { private: void Replicate(ov::ParameterVector params, ov::ResultVector results, GraphContext::CPtr context) { replicate_impl(params, results, context); - for (auto &node : nodesSet) nodes.emplace_back(node); + for (auto& node : nodesSet) + nodes.emplace_back(node); } std::vector nodes; @@ -91,21 +95,22 @@ class InplaceResolveIOCPUTestBase : public ::testing::Test { }; class RNNConcatCPUTest : public InplaceResolveIOCPUTestBase { -/*This test runs the following subgraph: - param0 param1 param2 - H_t X seq_lens - / \ | / - / \ | / - Softmax0 RNNSequence - \ /(Ho) \(Y) - \ / \ - Concat Reshape1 - | | - | | - Result0 Result1 - -Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 can share 
memory of inference output; -*/ + /*This test runs the following subgraph: + param0 param1 param2 + H_t X seq_lens + / \ | / + / \ | / + Softmax0 RNNSequence + \ /(Ho) \(Y) + \ / \ + Concat Reshape1 + | | + | | + Result0 Result1 + + Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 can share memory of inference + output; + */ protected: void replicate_impl(ov::ParameterVector params, ov::ResultVector results, GraphContext::CPtr context) override { std::vector> inputNodes; @@ -113,10 +118,16 @@ Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 inputNodes.push_back(std::make_shared(params[i], context)); } - auto dummy_softmax = std::make_shared( - params[0]->get_output_partial_shape(0), testPrec, "Softmax0" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, 0/*look*/); + auto dummy_softmax = std::make_shared(params[0]->get_output_partial_shape(0), + testPrec, + "Softmax0" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + 0 /*look*/); - auto concat = std::make_shared(ov::OutputVector{params[0], params[0]}, 0); // default, the connection will be reset by addEdge + auto concat = std::make_shared(ov::OutputVector{params[0], params[0]}, + 0); // default, the connection will be reset by addEdge auto concatNode = std::make_shared(concat, context); constexpr size_t input_size = 8; @@ -128,19 +139,30 @@ Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 auto wNode = std::make_shared(W, context); auto rNode = std::make_shared(R, context); auto bNode = std::make_shared(B, context); - auto rnnseq = std::make_shared( - params[1], // X - params[0], // H_t - params[2], // sequence_lengths - W, R, B, - hidden_size, RecurrentSequenceDirection::FORWARD); + auto rnnseq = std::make_shared(params[1], // X + params[0], // H_t + params[2], // sequence_lengths + W, + R, + B, + hidden_size, + RecurrentSequenceDirection::FORWARD); auto rnnseqNode = std::make_shared(rnnseq, 
context); - auto dummy_reshape = std::make_shared( - rnnseq->get_output_partial_shape(0), testPrec, "Reshape1" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, Edge::LOOK::LOOK_BOTH); + auto dummy_reshape = std::make_shared(rnnseq->get_output_partial_shape(0), + testPrec, + "Reshape1" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_BOTH); auto outputNode0 = std::make_shared(results.front(), context); - auto outputNode1 = std::make_shared(dummy_reshape->getOutputShapeAtPort(0), testPrec, "_result1", "Result", context); + auto outputNode1 = std::make_shared(dummy_reshape->getOutputShapeAtPort(0), + testPrec, + "_result1", + "Result", + context); addEdge(inputNodes[0], dummy_softmax, 0, 0); addEdge(inputNodes[0], rnnseqNode, 0, 0); @@ -155,8 +177,8 @@ Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 addEdge(rNode, rnnseqNode, 0, 4); addEdge(bNode, rnnseqNode, 0, 5); - addEdge(rnnseqNode, dummy_reshape, 0, 0); // Y - addEdge(rnnseqNode, concatNode, 1, 1); // Ho + addEdge(rnnseqNode, dummy_reshape, 0, 0); // Y + addEdge(rnnseqNode, concatNode, 1, 1); // Ho addEdge(dummy_reshape, outputNode1, 0, 0); } @@ -164,26 +186,32 @@ Edge Concat -> Result0 can share memory of inference output; Reshape1 -> Result1 TEST_F(RNNConcatCPUTest, smoke_resolve_inplace_io) { auto graph = create_graph({ov::PartialShape{-1, 1, 3}, ov::PartialShape{-1, 10, 8}, ov::PartialShape{-1}}, 2); - CheckInplaceDirection(graph, std::string("Concat"), 0/*inport*/, Edge::LOOK::LOOK_UP/*undesired edge look direction*/); - CheckInplaceDirection(graph, std::string("_result1"), 0/*inport*/, Edge::LOOK::LOOK_UP/*undesired edge look direction*/); + CheckInplaceDirection(graph, + std::string("Concat"), + 0 /*inport*/, + Edge::LOOK::LOOK_UP /*undesired edge look direction*/); + CheckInplaceDirection(graph, + std::string("_result1"), + 0 /*inport*/, + Edge::LOOK::LOOK_UP /*undesired edge look direction*/); } class 
SoftmaxAddReshapeOutputCPUTest : public InplaceResolveIOCPUTestBase { -/*This test runs the following subgraph: - - param - | - | - Softmax - / \ - / \ - Add Reshape0 - | | - | | - Result0 Result1 - -expect edge Reshape0->Result1 to be referenced by its upstreams, instead of referencing to its upstreams. -*/ + /*This test runs the following subgraph: + + param + | + | + Softmax + / \ + / \ + Add Reshape0 + | | + | | + Result0 Result1 + + expect edge Reshape0->Result1 to be referenced by its upstreams, instead of referencing to its upstreams. + */ protected: void replicate_impl(ov::ParameterVector params, ov::ResultVector results, GraphContext::CPtr context) override { std::vector> inputNodes; @@ -196,14 +224,29 @@ expect edge Reshape0->Result1 to be referenced by its upstreams, instead of refe outputNodes.push_back(std::make_shared(results[i], context)); } - auto dummy_softmax = std::make_shared( - testShape, testPrec, "softmax" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, 0/*look*/); - - auto dummy_add = std::make_shared( - testShape, testPrec, "add" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, 0/*look*/); - - auto dummy_reshape = std::make_shared( - testShape, testPrec, "reshape" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, Edge::LOOK::LOOK_BOTH); + auto dummy_softmax = std::make_shared(testShape, + testPrec, + "softmax" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + 0 /*look*/); + + auto dummy_add = std::make_shared(testShape, + testPrec, + "add" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + 0 /*look*/); + + auto dummy_reshape = std::make_shared(testShape, + testPrec, + "reshape" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_BOTH); addEdge(inputNodes.front(), dummy_softmax, 0, 0); @@ -217,30 +260,32 @@ expect edge Reshape0->Result1 to be referenced by its upstreams, instead of refe TEST_F(SoftmaxAddReshapeOutputCPUTest, 
smoke_resolve_inplace_io) { auto graph = create_graph({ov::PartialShape{2, -1}}, 2); - CheckInplaceDirection(graph, std::string("_result1"), 0/*inport*/, Edge::LOOK::LOOK_UP/*undesired edge look direction*/); + CheckInplaceDirection(graph, + std::string("_result1"), + 0 /*inport*/, + Edge::LOOK::LOOK_UP /*undesired edge look direction*/); } - class SoftmaxAddReshapeTwoOutputsCPUTest : public InplaceResolveIOCPUTestBase { -/*This test runs the following subgraph: - - param - | - | - Softmax - / \ - / \ - Add Reshape0 - | | \ - | | \ - Result0 Reshape1 Result2 - | - | - Result1 - -Hope Reshape0 could resolve downstream, so either edge Reshape1 -> Result1 or Reshape0 -> Result2 -could get a chance to be referenced by infer request. -*/ + /*This test runs the following subgraph: + + param + | + | + Softmax + / \ + / \ + Add Reshape0 + | | \ + | | \ + Result0 Reshape1 Result2 + | + | + Result1 + + Hope Reshape0 could resolve downstream, so either edge Reshape1 -> Result1 or Reshape0 -> Result2 + could get a chance to be referenced by infer request. + */ protected: void replicate_impl(ov::ParameterVector params, ov::ResultVector results, GraphContext::CPtr context) override { std::vector> inputNodes; @@ -253,17 +298,37 @@ could get a chance to be referenced by infer request. 
outputNodes.push_back(std::make_shared(results[i], context)); } - auto dummy_softmax = std::make_shared( - testShape, testPrec, "softmax" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, 0/*look*/); - - auto dummy_add = std::make_shared( - testShape, testPrec, "add" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, 0/*look*/); - - auto dummy_reshape0 = std::make_shared( - testShape, testPrec, "reshape0" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, Edge::LOOK::LOOK_BOTH); - - auto dummy_reshape1 = std::make_shared( - testShape, testPrec, "reshape1" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, Edge::LOOK::LOOK_BOTH); + auto dummy_softmax = std::make_shared(testShape, + testPrec, + "softmax" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + 0 /*look*/); + + auto dummy_add = std::make_shared(testShape, + testPrec, + "add" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + 0 /*look*/); + + auto dummy_reshape0 = std::make_shared(testShape, + testPrec, + "reshape0" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_BOTH); + + auto dummy_reshape1 = std::make_shared(testShape, + testPrec, + "reshape1" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_BOTH); addEdge(inputNodes.front(), dummy_softmax, 0, 0); @@ -280,22 +345,25 @@ could get a chance to be referenced by infer request. 
TEST_F(SoftmaxAddReshapeTwoOutputsCPUTest, smoke_resolve_inplace_io) { auto graph = create_graph({ov::PartialShape{2, -1}}, 3); - CheckInplaceDirection(graph, std::string("reshape0"), 0/*inport*/, Edge::LOOK::LOOK_UP/*undesired edge look direction*/); + CheckInplaceDirection(graph, + std::string("reshape0"), + 0 /*inport*/, + Edge::LOOK::LOOK_UP /*undesired edge look direction*/); } class InputReshapeOutputCPUTest : public InplaceResolveIOCPUTestBase { -/*This test runs the following subgraph: - - param - | - | - Reshape0 - | - | - Result0 - -Edge Reshape0 -> Result0 cannot be referenced by its upstreams as its upstream is an input. -*/ + /*This test runs the following subgraph: + + param + | + | + Reshape0 + | + | + Result0 + + Edge Reshape0 -> Result0 cannot be referenced by its upstreams as its upstream is an input. + */ protected: void replicate_impl(ov::ParameterVector params, ov::ResultVector results, GraphContext::CPtr context) override { std::vector> inputNodes; @@ -308,8 +376,13 @@ Edge Reshape0 -> Result0 cannot be referenced by its upstreams as its upstream i outputNodes.push_back(std::make_shared(results[i], context)); } - auto dummy_reshape = std::make_shared( - testShape, testPrec, "reshape0" /*name*/, "DummyNode" /*type*/, context, LayoutType::ncsp, Edge::LOOK::LOOK_BOTH); + auto dummy_reshape = std::make_shared(testShape, + testPrec, + "reshape0" /*name*/, + "DummyNode" /*type*/, + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_BOTH); addEdge(inputNodes.front(), dummy_reshape, 0, 0); addEdge(dummy_reshape, outputNodes.front(), 0, 0); @@ -318,5 +391,8 @@ Edge Reshape0 -> Result0 cannot be referenced by its upstreams as its upstream i TEST_F(InputReshapeOutputCPUTest, smoke_resolve_inplace_io) { auto graph = create_graph({ov::PartialShape{2, -1}}); - CheckInplaceDirection(graph, std::string("reshape0"), 0/*inport*/, Edge::LOOK::LOOK_DOWN/*undesired edge look direction*/); + CheckInplaceDirection(graph, + std::string("reshape0"), + 0 /*inport*/, + 
Edge::LOOK::LOOK_DOWN /*undesired edge look direction*/); } diff --git a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp index c59df309ace294..18983f08cf5294 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/memory_state.cpp @@ -4,12 +4,12 @@ #include #include "dummy_node.hpp" - #include "graph.h" +#include "memory_control.hpp" +#include "nodes/convert.h" #include "nodes/memory.hpp" -#include "nodes/softmax.h" #include "nodes/shapeof.h" -#include "nodes/convert.h" +#include "nodes/softmax.h" #include "openvino/op/convert.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/softmax.hpp" @@ -58,8 +58,8 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { // The ReadValue/Assign operations must be used in pairs in the model. // For each such a pair, its own variable object must be created. const std::string variable_name("variable0"); - auto variable = std::make_shared( - ov::op::util::VariableInfo{test_shape, test_prec, variable_name}); + auto variable = + std::make_shared(ov::op::util::VariableInfo{test_shape, test_prec, variable_name}); // creat ngraph ops to build CPU nodes auto read = std::make_shared(param, variable); @@ -78,26 +78,33 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { nodes_set.insert(child); }; - //create graph context + // create graph context Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + auto context = + std::make_shared(conf, nullptr, false); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); - auto first_dummy = std::make_shared( - test_shape, test_prec, - "first_dummy", - "DummyNode", - context, - LayoutType::ncsp, - first_dummy_inplace_direction, - true); + auto first_dummy = std::make_shared(test_shape, + test_prec, + "first_dummy", + "DummyNode", + context, + 
LayoutType::ncsp, + first_dummy_inplace_direction, + true); auto memory_output = std::make_shared(assign, context); - auto second_dummy = std::make_shared( - test_shape, test_prec, "second_dummy", "DummyNode", context, LayoutType::ncsp, Edge::LOOK::LOOK_UP, true); + auto second_dummy = std::make_shared(test_shape, + test_prec, + "second_dummy", + "DummyNode", + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_UP, + true); auto softmax_node = std::make_shared(softmax, context); auto output_node = std::make_shared(res, context); @@ -117,8 +124,9 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { auto find_node_str = [](const Graph& graph, const char* name) -> NodePtr { auto&& nodes = graph.GetNodes(); - auto itr = - std::find_if(nodes.begin(), nodes.end(), [=](const NodePtr& node){ return name == node->getName(); }); + auto itr = std::find_if(nodes.begin(), nodes.end(), [=](const NodePtr& node) { + return name == node->getName(); + }); if (itr == nodes.end()) { return nullptr; @@ -129,8 +137,9 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { auto find_node_type = [](const Graph& graph, Type type) -> NodePtr { auto&& nodes = graph.GetNodes(); - auto itr = - std::find_if(nodes.begin(), nodes.end(), [=](const NodePtr& node){ return type == node->getType(); }); + auto itr = std::find_if(nodes.begin(), nodes.end(), [=](const NodePtr& node) { + return type == node->getType(); + }); if (itr == nodes.end()) { return nullptr; @@ -165,7 +174,7 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { auto second_dummy_out_mem = second_dummy->getDstDataAtPort(0); auto memory_output_inp_mem = memory_output->getSrcDataAtPort(0); - //otherwise the memory will be modified by the dummy_look_up + // otherwise the memory will be modified by the dummy_look_up ASSERT_NE(second_dummy_out_mem, memory_output_inp_mem); // due to double buffer usage by default @@ -203,7 +212,7 @@ TEST(MemStateGraphTest, smoke_Check_Memory_Modification_Guard) { 
auto second_dummy_out_mem = second_dummy->getDstDataAtPort(0); auto memory_output_inp_mem = memory_output->getSrcDataAtPort(0); - //otherwise the memory will be modified by the dummy_look_up + // otherwise the memory will be modified by the dummy_look_up ASSERT_NE(second_dummy_out_mem, memory_output_inp_mem); // as the input memory bypassed through a cascade of look_up inplace nodes, it's set directly to the output @@ -255,8 +264,8 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { // The ReadValue/Assign operations must be used in pairs in the model. // For each such a pair, its own variable object must be created. const std::string variable_name("variable0"); - auto variable = std::make_shared( - ov::op::util::VariableInfo{test_shape, test_prec, variable_name}); + auto variable = + std::make_shared(ov::op::util::VariableInfo{test_shape, test_prec, variable_name}); // creat ngraph ops to build CPU nodes auto read = std::make_shared(param, variable); @@ -277,22 +286,23 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { nodes_set.insert(child); }; - //create graph context + // create graph context Config conf; conf.rtCacheCapacity = 0; - auto context = std::make_shared(conf, nullptr, false); + auto context = + std::make_shared(conf, nullptr, false); auto input_node = std::make_shared(param, context); auto memory_input = std::make_shared(read, context); - auto dummy = std::make_shared( - test_shape, test_prec, - "first_dummy", - "DummyNode", - context, - LayoutType::ncsp, - Edge::LOOK::LOOK_UP, - true); + auto dummy = std::make_shared(test_shape, + test_prec, + "first_dummy", + "DummyNode", + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_UP, + true); auto memory_output = std::make_shared(assign, context); auto shapeof_node = std::make_shared(shapeof, context); @@ -318,8 +328,9 @@ TEST(MemStateGraphTest, smoke_ShapeOf_no_Inplace_Conflicts) { graph.CreateGraph(graph_nodes, graph_edges, context, "test_graph"); auto&& nodes = 
graph.GetNodes(); - auto itr = std::find_if(nodes.begin(), nodes.end(), - [](const NodePtr& node){ return Type::Reorder == node->getType(); }); + auto itr = std::find_if(nodes.begin(), nodes.end(), [](const NodePtr& node) { + return Type::Reorder == node->getType(); + }); ASSERT_EQ(itr, nodes.end()); } diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index 87c6f2db5e728f..ddd878fbcfedfb 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -2,13 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // #include +#include #include -#include #include "common_test_utils/node_builders/constant.hpp" #include "dummy_node.hpp" #include "graph.h" +#include "memory_control.hpp" #include "nodes/input.h" #include "nodes/reorder.h" #include "nodes/reshape.h" @@ -73,10 +74,12 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(GetParam()); const auto& params = std::get<1>(GetParam()); OPENVINO_ASSERT(shape.size() == 4 || shape.size() == 3, - "MergeTransposeReorderCPUTest doesn't support shape", shape, + "MergeTransposeReorderCPUTest doesn't support shape", + shape, ". 
Only 4D and 3D shapes are supported"); Config conf; - m_context = std::make_shared(conf, nullptr, false); + m_context = + std::make_shared(conf, nullptr, false); const auto replication_result = CreateModelAndReplicate(shape, params.firstNodeLayout, params.firstNodeInplaceDirection, @@ -87,12 +90,13 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterfaceCreateGraph(replication_result.first, replication_result.second, m_context, "fused_graph"); } - virtual std::pair, std::vector> CreateModelAndReplicate(const ov::Shape& testShape, - LayoutType firstNodeLayout, - LOOK firstNodeInplaceDirection, - LayoutType lastNodeLayout, - LOOK lastNodeInplaceDirection, - size_t num_consumers) { + virtual std::pair, std::vector> CreateModelAndReplicate( + const ov::Shape& testShape, + LayoutType firstNodeLayout, + LOOK firstNodeInplaceDirection, + LayoutType lastNodeLayout, + LOOK lastNodeInplaceDirection, + size_t num_consumers) { const auto precision = ov::element::f32; // ov::Model with only a transpose node ov::ParameterVector params{std::make_shared(precision, testShape)}; @@ -118,8 +122,13 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(params[0], m_context); - auto dummyNode1 = std::make_shared( - testShape, precision, "reshape", "DummyNode", m_context, firstNodeLayout, firstNodeInplaceDirection); + auto dummyNode1 = std::make_shared(testShape, + precision, + "reshape", + "DummyNode", + m_context, + firstNodeLayout, + firstNodeInplaceDirection); auto orderNode = std::make_shared(constOrder, m_context); auto transposeNode = std::make_shared(transpose, m_context); @@ -142,13 +151,14 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterfaceGetNodes()) { + for (auto& node : m_graph->GetNodes()) { if (node->getType() == Type::Transpose) { transpose_count++; } @@ -159,7 +169,7 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterfaceGetNodes()) { + for (auto& node : m_graph->GetNodes()) { if (auto 
reorder_node = std::dynamic_pointer_cast(node)) { if (reorder_node->getOptimized()) optimized_count++; @@ -176,29 +186,29 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface, std::vector> CreateModelAndReplicate(const ov::Shape& testShape, @@ -209,7 +219,8 @@ class MergeTransposeReorderWithReshapeCPUTest : public MergeTransposeReorderCPUT size_t num_consumers) override { const auto precision = ov::element::f32; const auto param = std::make_shared(precision, testShape); - auto reshape_const = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{0, 0, -1}); + auto reshape_const = + std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{0, 0, -1}); auto reshape = std::make_shared(param, reshape_const, true); auto order = std::vector{0, 2, 1}; auto transpose_order = std::make_shared(ov::element::i32, ov::Shape{order.size()}, order); @@ -232,8 +243,13 @@ class MergeTransposeReorderWithReshapeCPUTest : public MergeTransposeReorderCPUT }; auto inputNode = std::make_shared(param, m_context); - auto dummyNode1 = std::make_shared( - testShape, precision, "before_reshape", "DummyNode", m_context, LayoutType::nspc, LOOK::LOOK_UP); + auto dummyNode1 = std::make_shared(testShape, + precision, + "before_reshape", + "DummyNode", + m_context, + LayoutType::nspc, + LOOK::LOOK_UP); auto reshapeConstNode = std::make_shared(reshape_const, m_context); auto reshapeNode = std::make_shared(reshape, m_context); @@ -261,7 +277,8 @@ class MergeTransposeReorderWithReshapeCPUTest : public MergeTransposeReorderCPUT addEdge(transposeNode, dummyConsumer, 0, 0); addEdge(dummyConsumer, outputNode, 0, 0); } - for (auto &node : nodesSet) nodes.emplace_back(node); + for (auto& node : nodesSet) + nodes.emplace_back(node); return {nodes, edges}; } }; @@ -340,30 +357,46 @@ TEST(MergeTransposeReorder, smoke_InplaceConflict) { std::unique_ptr graph = std::unique_ptr(new Graph()); const ov::Shape testShape{1, 8, 8, 8}; - ov::ParameterVector 
params{std::make_shared(ov::element::Type_t::f32, ov::Shape{1, 8, 8, 8})}; + ov::ParameterVector params{ + std::make_shared(ov::element::Type_t::f32, ov::Shape{1, 8, 8, 8})}; - auto shape_constant = std::make_shared(ov::element::Type_t::i32, ov::Shape{3}, std::vector{1, 8, 64}); + auto shape_constant = + std::make_shared(ov::element::Type_t::i32, ov::Shape{3}, std::vector{1, 8, 64}); auto reshape_node = std::make_shared(params[0], shape_constant, true); - auto order_constant = std::make_shared(ov::element::Type_t::i32, ov::Shape{3}, std::vector{0, 2, 1}); + auto order_constant = + std::make_shared(ov::element::Type_t::i32, ov::Shape{3}, std::vector{0, 2, 1}); auto transpose_node = std::make_shared(reshape_node, order_constant); ov::ResultVector results{std::make_shared(transpose_node), std::make_shared(params[0])}; auto nspcCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::nspc); - auto inDesc = nspcCreator->createSharedDesc(ov::element::Type_t::f32, ov::intel_cpu::Shape(ov::intel_cpu::VectorDims{1, 8, 8, 8})); + auto inDesc = nspcCreator->createSharedDesc(ov::element::Type_t::f32, + ov::intel_cpu::Shape(ov::intel_cpu::VectorDims{1, 8, 8, 8})); auto inputNode = std::make_shared(inDesc->clone(), "Input0", "Parameter", context); auto shapeConst = std::make_shared(shape_constant, context); auto reshapeNode = std::make_shared(reshape_node, context); auto orderConst = std::make_shared(order_constant, context); auto transposeNode = std::make_shared(transpose_node, context); - auto dummyNode0 = std::make_shared( - ov::Shape{1, 64, 8}, ov::element::Type_t::f32, "Dummy0", "DummyNode", context, LayoutType::ncsp, Edge::LOOK::LOOK_UP, true); + auto dummyNode0 = std::make_shared(ov::Shape{1, 64, 8}, + ov::element::Type_t::f32, + "Dummy0", + "DummyNode", + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_UP, + true); auto outputNode0 = std::make_shared(results[0], context); - auto dummyNode1 = std::make_shared( - ov::Shape{1, 8, 8, 8}, ov::element::Type_t::f32, 
"Dummy1", "DummyNode", context, LayoutType::nspc, Edge::LOOK::LOOK_UP, true); + auto dummyNode1 = std::make_shared(ov::Shape{1, 8, 8, 8}, + ov::element::Type_t::f32, + "Dummy1", + "DummyNode", + context, + LayoutType::nspc, + Edge::LOOK::LOOK_UP, + true); auto outputNode1 = std::make_shared(results[1], context); std::vector graphNodes; @@ -387,7 +420,8 @@ TEST(MergeTransposeReorder, smoke_InplaceConflict) { addEdge(transposeNode, dummyNode0, 0, 0); addEdge(dummyNode0, outputNode0, 0, 0); - for (auto &node : nodesSet) graphNodes.emplace_back(node); + for (auto& node : nodesSet) + graphNodes.emplace_back(node); graph->CreateGraph(graphNodes, graphEdges, context, "test_graph"); auto expected_reorder_node0 = dummyNode0->getParentEdgeAt(0)->getParent(); diff --git a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp index 9752b17b2e4e4d..2b342e5cc72cdb 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/resolve_edge_conflicts_test.cpp @@ -5,16 +5,17 @@ #include "dummy_node.hpp" #include "graph.h" -#include "nodes/input.h" +#include "memory_control.hpp" #include "nodes/concat.h" +#include "nodes/input.h" #include "openvino/op/concat.hpp" -#include "openvino/opsets/opset.hpp" -#include "openvino/op/shape_of.hpp" +#include "openvino/op/constant.hpp" #include "openvino/op/parameter.hpp" -#include "openvino/op/result.hpp" #include "openvino/op/reduce_prod.hpp" +#include "openvino/op/result.hpp" #include "openvino/op/scatter_nd_update.hpp" -#include "openvino/op/constant.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/opsets/opset.hpp" using namespace ov::intel_cpu; @@ -37,7 +38,7 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { | Output - Dummy1, Dummy2 and Dummy3 can be inplace. In ResolveEdgeConflicts(), detect Dummy3 is + Dummy1, Dummy2 and Dummy3 can be inplace. 
In ResolveEdgeConflicts(), detect Dummy3 is a modifying node. Collect consumers of edge Input->Dummy1 and find consumer execution order is after Dummy3. Then insert Reorder in edge Input->Dummy2. */ @@ -57,14 +58,24 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { auto inputNode = std::make_shared(params[0], context); auto outputNode = std::make_shared(results[0], context); auto concatNode = std::make_shared(concat, context); - auto dummyNode1 = std::make_shared( - testShape, testPrec, "Dummy1", "DummyNode", context); - auto dummyNode2 = std::make_shared( - testShape, testPrec, "Dummy2", "DummyNode", context); - auto dummyNode3 = std::make_shared( - testShape, testPrec, "Dummy3", "DummyNode", context, LayoutType::ncsp, Edge::LOOK::LOOK_UP, true); - auto dummyNode4 = std::make_shared( - testShape, testPrec, "Dummy4", "DummyNode", context, LayoutType::ncsp, 0, true); + auto dummyNode1 = std::make_shared(testShape, testPrec, "Dummy1", "DummyNode", context); + auto dummyNode2 = std::make_shared(testShape, testPrec, "Dummy2", "DummyNode", context); + auto dummyNode3 = std::make_shared(testShape, + testPrec, + "Dummy3", + "DummyNode", + context, + LayoutType::ncsp, + Edge::LOOK::LOOK_UP, + true); + auto dummyNode4 = std::make_shared(testShape, + testPrec, + "Dummy4", + "DummyNode", + context, + LayoutType::ncsp, + 0, + true); std::vector graphNodes; std::vector graphEdges; @@ -84,7 +95,8 @@ TEST(ResolveEdgeConflictsCPUTest, smoke_Run_ResolveEdgeConflicts) { addEdge(inputNode, dummyNode1, 0, 0); addEdge(dummyNode1, concatNode, 0, 0); addEdge(concatNode, outputNode, 0, 0); - for (auto &node : nodesSet) graphNodes.emplace_back(node); + for (auto& node : nodesSet) + graphNodes.emplace_back(node); graph->CreateGraph(graphNodes, graphEdges, context, "test_graph"); // Check whether reorder is inserted @@ -116,9 +128,12 @@ TEST(ResolveEdgeConflictsCPUTest2, smoke_Run_ResolveEdgeConflicts2) { auto org_ReduceProd_423 = std::make_shared(org_ShapeOf_386, 
params[1]); - auto org_Constant_387 = std::make_shared(ov::element::Type_t::i32, ov::Shape{1, 1}, std::vector{1}); - auto org_Constant_1 = std::make_shared(ov::element::Type_t::i32, ov::Shape{1}, std::vector{1}); - auto org_ScatterNDUpdate_411 = std::make_shared(org_ShapeOf_386, org_Constant_387, org_Constant_1); + auto org_Constant_387 = + std::make_shared(ov::element::Type_t::i32, ov::Shape{1, 1}, std::vector{1}); + auto org_Constant_1 = + std::make_shared(ov::element::Type_t::i32, ov::Shape{1}, std::vector{1}); + auto org_ScatterNDUpdate_411 = + std::make_shared(org_ShapeOf_386, org_Constant_387, org_Constant_1); ov::ResultVector results{std::make_shared(org_ScatterNDUpdate_411), std::make_shared(org_ReduceProd_423)}; diff --git a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp index ac43a743fbc402..7864dd2426c8cf 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/reorder_node_test.cpp @@ -14,6 +14,7 @@ #include #include "common_test_utils/common_utils.hpp" +#include "memory_control.hpp" #include "nodes/input.h" using namespace ov::intel_cpu; @@ -109,17 +110,14 @@ class ReorderCPUTestGraph { auto context = std::make_shared(conf, std::make_shared(), false); + const dnnl::engine cpuEngine = context->getEngine(); - inputNode = std::make_shared(inputDesc.clone(), - "Reorder_Input", - "Parameter", - context); + inputNode = + std::make_shared(inputDesc.clone(), "Reorder_Input", "Parameter", context); reorderNode = std::make_shared(inputDesc, outputDesc, "Reorder", context); - outputNode = std::make_shared(outputDesc.clone(), - "Reorder_Output", - "Result", - context); + outputNode = + std::make_shared(outputDesc.clone(), "Reorder_Output", "Result", context); parentEdge = std::make_shared(inputNode, reorderNode, 0, 0); childEdge = std::make_shared(reorderNode, outputNode, 0, 0); @@ -154,7 +152,7 @@ class ReorderCPUTestGraph { 
ov::element::Type prec; }; -}// namespace ReorderCPUTest +} // namespace ReorderCPUTest using namespace ReorderCPUTest; @@ -305,8 +303,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomStrideWithFactor, * ReorderCPUTest to test the CPU plugin-in dynamism and RT cache */ class ReorderDynamismCPUTest : public ::testing::Test, - public ::testing::WithParamInterface, - public ::ReorderCPUTest::ReorderCPUTestGraph { + public ::testing::WithParamInterface, + public ::ReorderCPUTest::ReorderCPUTestGraph { public: static std::string getTestCaseName(const testing::TestParamInfo& obj) { ReorderCPUTestParamSet p = obj.param;