diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp
index aa6b684d0..9f6a8e3d0 100644
--- a/hipamd/src/hip_device.cpp
+++ b/hipamd/src/hip_device.cpp
@@ -274,8 +274,6 @@ void Device::SyncAllStreams( bool cpu_wait) {
   }
   // Release freed memory for all memory pools on the device
   ReleaseFreedMemory();
-  // Release all graph exec objects destroyed by user.
-  ReleaseGraphExec(hip::getCurrentDevice()->deviceId());
 }
 
 // ================================================================================================
diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp
index f29df7838..d708fb235 100644
--- a/hipamd/src/hip_event.cpp
+++ b/hipamd/src/hip_event.cpp
@@ -448,8 +448,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) {
   hipError_t status = e->synchronize();
   // Release freed memory for all memory pools on the device
   g_devices[e->deviceId()]->ReleaseFreedMemory();
-  // Release all graph exec objects destroyed by user.
-  ReleaseGraphExec(e->deviceId());
 
   HIP_RETURN(status);
 }
diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp
index 2e0da2f43..ab72635ec 100644
--- a/hipamd/src/hip_graph.cpp
+++ b/hipamd/src/hip_graph.cpp
@@ -27,8 +27,6 @@
 #include "hip_mempool_impl.hpp"
 
 namespace hip {
-extern std::unordered_map<GraphExec*, std::pair<hip::Stream*, bool>> GraphExecStatus_;
-extern amd::Monitor GraphExecStatusLock_;
 std::vector<hip::Stream*> g_captureStreams;
 // StreamCaptureGlobalList lock
 amd::Monitor g_captureStreamsLock ROCCLR_INIT_PRIORITY(101){true};
@@ -1422,16 +1420,10 @@ hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
   HIP_INIT_API(hipGraphExecDestroy, pGraphExec);
   if (pGraphExec == nullptr) {
     HIP_RETURN(hipErrorInvalidValue);
   }
-  amd::ScopedLock lock(GraphExecStatusLock_);
   hip::GraphExec* ge = reinterpret_cast<hip::GraphExec*>(pGraphExec);
-  // bool found = false;
-  if (GraphExecStatus_.find(ge) == GraphExecStatus_.end()) {
-    ge->release();
-  } else {
-    // graph execution is under progress. destroy graphExec during next sync point
-    auto pair = GraphExecStatus_[ge];
-    GraphExecStatus_[ge] = std::make_pair(pair.first, true);
-  }
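+  // GraphExec::Run() retains this object for every in-flight launch and releases it
+  // from the marker callback, so an unconditional release is safe here.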
+  ge->release();
   HIP_RETURN(hipSuccess);
 }
diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp
index 77189c7ee..bff3c9b20 100644
--- a/hipamd/src/hip_graph_internal.cpp
+++ b/hipamd/src/hip_graph_internal.cpp
@@ -51,10 +51,6 @@ const char* GetGraphNodeTypeString(uint32_t op) {
 }
 
 namespace hip {
-std::unordered_map<GraphExec*, std::pair<hip::Stream*, bool>>
-    GraphExecStatus_ ROCCLR_INIT_PRIORITY(101);
-// Guards graph execution state
-amd::Monitor GraphExecStatusLock_ ROCCLR_INIT_PRIORITY(101){true};
 int GraphNode::nextID = 0;
 int Graph::nextID = 0;
 
@@ -601,6 +597,17 @@ void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* st
   }
 }
 
+// ================================================================================================
+
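+// Marker-completion callback registered by GraphExec::Run(). Releases the reference
+// that kept the GraphExec alive while the graph was executing on the device.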
+void GraphExec::DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data) {
+  GraphExec* graphExec = reinterpret_cast<GraphExec*>(user_data);
+  graphExec->release();
+}
+
+// ================================================================================================
+
 hipError_t EnqueueGraphWithSingleList(std::vector<Node>& topoOrder, hip::Stream* hip_stream,
                                       hip::GraphExec* graphExec) {
   // Accumulate command tracks all the AQL packet batch that we submit to the HW. For now
@@ -893,52 +900,42 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
       }
     }
   }
-  amd::ScopedLock lock(GraphExecStatusLock_);
-  GraphExecStatus_[this] = std::make_pair(launch_stream, false);
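+  // Hold an extra reference until the marker below completes; DecrementRefCount()
+  // drops it from the callback once the launch has finished on the device.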
+  this->retain();
+  amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
+  // Cache flushes are not required for this marker.
+  CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
+  amd::Event& event = CallbackCommand->event();
+  if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this)) {
+    return hipErrorInvalidHandle;
+  }
+  CallbackCommand->enqueue();
+  // Add a barrier to stall the stream until the callback has completed.
+  amd::Command::EventWaitList eventWaitList;
+  eventWaitList.push_back(CallbackCommand);
+  amd::Command* block_command = new amd::Marker(*launch_stream, kMarkerDisableFlush, eventWaitList);
+  if (block_command == nullptr) {
+    return hipErrorInvalidValue;
+  }
+  // Cache flushes are not required for this marker either.
+  block_command->setEventScope(amd::Device::kCacheStateIgnore);
+  block_command->enqueue();
+  block_command->release();
+  CallbackCommand->release();
   ResetQueueIndex();
   return status;
 }
 
-void ReleaseGraphExec(int deviceId) {
-  // Release all graph exec objects destroyed by user.
-  amd::ScopedLock lock(GraphExecStatusLock_);
-  for (auto itr = GraphExecStatus_.begin(); itr != GraphExecStatus_.end();) {
-    auto pair = itr->second;
-    if (pair.first->DeviceId() == deviceId) {
-      if (pair.second == true) {
-        ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
-        (itr->first)->release();
-      }
-      GraphExecStatus_.erase(itr++);
-    } else {
-      itr++;
-    }
-  }
-}
-
-void ReleaseGraphExec(hip::Stream* stream) {
-  amd::ScopedLock lock(GraphExecStatusLock_);
-  for (auto itr = GraphExecStatus_.begin(); itr != GraphExecStatus_.end();) {
-    auto pair = itr->second;
-    if (pair.first == stream) {
-      if (pair.second == true) {
-        ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
-        (itr->first)->release();
-      }
-      GraphExecStatus_.erase(itr++);
-    } else {
-      ++itr;
-    }
-  }
-}
-
 // ================================================================================================
 bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size) {
   bool bStatus = true;
   assert(pool_size > 0);
   address graph_kernarg_base;
   auto device = g_devices[ihipGetDevice()]->devices()[0];
-
+  // The current device is stored in TLS. Cache it here so that kernel args can be
+  // freed from the callback thread, where the TLS device may differ.
+  device_ = device;
   if (device->info().largeBar_) {
     graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
     device_kernarg_pool_ = true;
@@ -977,13 +974,12 @@ address GraphKernelArgManager::AllocKernArg(size_t size, size_t alignment) {
 }
 
 void GraphKernelArgManager::ReadBackOrFlush() {
-  if (device_kernarg_pool_) {
-    auto device = g_devices[ihipGetDevice()]->devices()[0];
-    auto kernArgImpl = device->settings().kernel_arg_impl_;
+  if (device_kernarg_pool_ && device_) {
+    auto kernArgImpl = device_->settings().kernel_arg_impl_;
     if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
-      *device->info().hdpMemFlushCntl = 1u;
-      auto kSentinel = *reinterpret_cast<volatile int*>(device->info().hdpMemFlushCntl);
+      *device_->info().hdpMemFlushCntl = 1u;
+      auto kSentinel = *reinterpret_cast<volatile int*>(device_->info().hdpMemFlushCntl);
     } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback &&
                kernarg_graph_.back().kernarg_pool_addr_ != 0) {
       address dev_ptr =
diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp
index 0dd918b78..a75d431a3 100644
--- a/hipamd/src/hip_graph_internal.hpp
+++ b/hipamd/src/hip_graph_internal.hpp
@@ -160,11 +160,12 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
   GraphKernelArgManager() : amd::ReferenceCountedObject() {}
   ~GraphKernelArgManager() {
     //! Release the kernel arg pools
-    auto device = g_devices[ihipGetDevice()]->devices()[0];
-    for (auto& element : kernarg_graph_) {
-      device->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
+    if (device_ != nullptr) {
+      for (auto& element : kernarg_graph_) {
+        device_->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
+      }
+      kernarg_graph_.clear();
     }
-    kernarg_graph_.clear();
   }
 
   // Allocate kernel arg pool for the given size.
@@ -185,7 +186,8 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
     size_t kernarg_pool_size_;    //! Size of the pool
     size_t kernarg_pool_offset_;  //! Current offset in the kernel arg alloc
   };
-  bool device_kernarg_pool_ = false;  //! Indicate if kernel pool in device mem
+  bool device_kernarg_pool_ = false;  //! Indicate if kernel pool in device mem
+  amd::Device* device_ = nullptr;     //! Device from which kernel arguments are allocated
   std::vector<KernelArgPoolGraph> kernarg_graph_;  //! Vector of allocated kernarg pool
   using KernelArgImpl = device::Settings::KernelArgImpl;
 };
@@ -828,6 +830,7 @@ struct GraphExec : public amd::ReferenceCountedObject {
 
   GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
 
+  static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
 };
 
 struct ChildGraphNode : public GraphNode {
diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp
index 8edaacd3c..d0a6dca57 100644
--- a/hipamd/src/hip_internal.hpp
+++ b/hipamd/src/hip_internal.hpp
@@ -89,8 +89,6 @@ struct GraphNode;
 struct GraphExec;
 struct UserObject;
 class Stream;
-extern void ReleaseGraphExec(int deviceId);
-extern void ReleaseGraphExec(hip::Stream* stream);
 typedef struct ihipIpcMemHandle_st {
   char ipc_handle[IHIP_IPC_MEM_HANDLE_SIZE];  ///< ipc memory handle on ROCr
   size_t psize;
diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp
index 40ffc6ce2..4db1dc77c 100644
--- a/hipamd/src/hip_stream.cpp
+++ b/hipamd/src/hip_stream.cpp
@@ -82,7 +82,6 @@ void Stream::Destroy(hip::Stream* stream) {
 // ================================================================================================
 bool Stream::terminate() {
   HostQueue::terminate();
-  ReleaseGraphExec(this);
   return true;
 }
 // ================================================================================================
@@ -368,12 +367,6 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
   }
   // Wait for the current host queue
   hip_stream->finish(wait_for_cpu);
-  if (stream == nullptr) {
-    // null stream will sync with other streams.
-    ReleaseGraphExec(hip_stream->DeviceId());
-  } else {
-    ReleaseGraphExec(hip_stream);
-  }
   // Release freed memory for all memory pools on the device
   hip_stream->GetDevice()->ReleaseFreedMemory();
   return hipSuccess;