Skip to content

Commit

Permalink
SWDEV-480209 - Handle GraphExec object release
Browse files Browse the repository at this point in the history
=> The GraphExec instance could be destroyed before an async launch
completes; destroy it only after all pending graph launches finish.
=> Remove the GraphExec destroy performed at the next sync point
(hipStreamSynchronize, hipDeviceSynchronize, etc.)

Change-Id: I4df682aae5787fd6e5240a7be936ce50361345d0
  • Loading branch information
Anusha GodavarthySurya authored and Anusha Godavarthy Surya committed Oct 22, 2024
1 parent 05d6f75 commit f9f995c
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 74 deletions.
2 changes: 0 additions & 2 deletions hipamd/src/hip_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,6 @@ void Device::SyncAllStreams( bool cpu_wait) {
}
// Release freed memory for all memory pools on the device
ReleaseFreedMemory();
// Release all graph exec objects destroyed by user.
ReleaseGraphExec(hip::getCurrentDevice()->deviceId());
}

// ================================================================================================
Expand Down
2 changes: 0 additions & 2 deletions hipamd/src/hip_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,8 +448,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) {
hipError_t status = e->synchronize();
// Release freed memory for all memory pools on the device
g_devices[e->deviceId()]->ReleaseFreedMemory();
// Release all graph exec objects destroyed by user.
ReleaseGraphExec(e->deviceId());
HIP_RETURN(status);
}

Expand Down
12 changes: 1 addition & 11 deletions hipamd/src/hip_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
#include "hip_mempool_impl.hpp"

namespace hip {
extern std::unordered_map<GraphExec*, std::pair<hip::Stream*, bool>> GraphExecStatus_;
extern amd::Monitor GraphExecStatusLock_;

std::vector<hip::Stream*> g_captureStreams;
// StreamCaptureGlobalList lock
Expand Down Expand Up @@ -1422,16 +1420,8 @@ hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
if (pGraphExec == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
amd::ScopedLock lock(GraphExecStatusLock_);
hip::GraphExec* ge = reinterpret_cast<hip::GraphExec*>(pGraphExec);
// bool found = false;
if (GraphExecStatus_.find(ge) == GraphExecStatus_.end()) {
ge->release();
} else {
// graph execution is under progress. destroy graphExec during next sync point
auto pair = GraphExecStatus_[ge];
GraphExecStatus_[ge] = std::make_pair(pair.first, true);
}
ge->release();
HIP_RETURN(hipSuccess);
}

Expand Down
82 changes: 37 additions & 45 deletions hipamd/src/hip_graph_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,6 @@ const char* GetGraphNodeTypeString(uint32_t op) {
}

namespace hip {
std::unordered_map<GraphExec *, std::pair<hip::Stream *, bool>>
GraphExecStatus_ ROCCLR_INIT_PRIORITY(101);
// Guards graph execution state
amd::Monitor GraphExecStatusLock_ ROCCLR_INIT_PRIORITY(101){true};

int GraphNode::nextID = 0;
int Graph::nextID = 0;
Expand Down Expand Up @@ -601,6 +597,15 @@ void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* st
}
}

// ================================================================================================

// Completion callback attached (via amd::Event::setCallback with CL_COMPLETE)
// to the marker enqueued after a graph launch. It drops the reference taken
// before the launch (this->retain() in GraphExec::Run), so the GraphExec is
// only destroyed once all pending launches have completed.
// event / command_exec_status are unused; user_data carries the GraphExec*.
void GraphExec::DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data) {
  // static_cast is the correct (and sufficient) cast from void* back to the
  // pointer type that was passed in; reinterpret_cast is not needed here.
  GraphExec* graphExec = static_cast<GraphExec*>(user_data);
  graphExec->release();
}

// ================================================================================================

hipError_t EnqueueGraphWithSingleList(std::vector<hip::Node>& topoOrder, hip::Stream* hip_stream,
hip::GraphExec* graphExec) {
// Accumulate command tracks all the AQL packet batch that we submit to the HW. For now
Expand Down Expand Up @@ -893,52 +898,40 @@ hipError_t GraphExec::Run(hipStream_t graph_launch_stream) {
}
}
}
amd::ScopedLock lock(GraphExecStatusLock_);
GraphExecStatus_[this] = std::make_pair(launch_stream, false);
this->retain();
amd::Command* CallbackCommand = new amd::Marker(*launch_stream, kMarkerDisableFlush, {});
// we may not need to flush any caches.
CallbackCommand->setEventScope(amd::Device::kCacheStateIgnore);
amd::Event& event = CallbackCommand->event();
if (!event.setCallback(CL_COMPLETE, GraphExec::DecrementRefCount, this)) {
return hipErrorInvalidHandle;
}
CallbackCommand->enqueue();
// Add the new barrier to stall the stream, until the callback is done
amd::Command::EventWaitList eventWaitList;
eventWaitList.push_back(CallbackCommand);
amd::Command* block_command = new amd::Marker(*launch_stream, kMarkerDisableFlush, eventWaitList);
// we may not need to flush any caches.
block_command->setEventScope(amd::Device::kCacheStateIgnore);
if (block_command == nullptr) {
return hipErrorInvalidValue;
}
block_command->enqueue();
block_command->release();
CallbackCommand->release();
ResetQueueIndex();
return status;
}

// Sweeps the global GraphExecStatus_ map at a device-wide sync point and
// releases every GraphExec on the given device that the user has already
// destroyed (pair.second == true) while a launch was still in flight.
// Entries for the device are removed from the map either way.
// NOTE(review): pair.first is the launch stream; presumably its DeviceId()
// identifies the owning device — confirm against GraphExec::Run.
void ReleaseGraphExec(int deviceId) {
// Release all graph exec objects destroyed by user.
amd::ScopedLock lock(GraphExecStatusLock_);
for (auto itr = GraphExecStatus_.begin(); itr != GraphExecStatus_.end();) {
auto pair = itr->second;
if (pair.first->DeviceId() == deviceId) {
if (pair.second == true) {
// Deferred destroy requested by hipGraphExecDestroy while the graph
// was still executing; drop the reference now that we are at a sync point.
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
(itr->first)->release();
}
// Post-increment inside erase keeps the iterator valid across the erase.
GraphExecStatus_.erase(itr++);
} else {
itr++;
}
}
}

// Stream-scoped variant of the sweep above: at a stream sync point, releases
// every GraphExec that was launched on this stream and has a pending deferred
// destroy (pair.second == true), then removes the stream's entries from the
// global GraphExecStatus_ map.
void ReleaseGraphExec(hip::Stream* stream) {
amd::ScopedLock lock(GraphExecStatusLock_);
for (auto itr = GraphExecStatus_.begin(); itr != GraphExecStatus_.end();) {
auto pair = itr->second;
if (pair.first == stream) {
if (pair.second == true) {
// User called hipGraphExecDestroy while the launch was in progress;
// perform the real release now that the stream has synchronized.
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Release GraphExec");
(itr->first)->release();
}
// Post-increment inside erase keeps the iterator valid across the erase.
GraphExecStatus_.erase(itr++);
} else {
++itr;
}
}
}

// ================================================================================================
bool GraphKernelArgManager::AllocGraphKernargPool(size_t pool_size) {
bool bStatus = true;
assert(pool_size > 0);
address graph_kernarg_base;
auto device = g_devices[ihipGetDevice()]->devices()[0];

// Current device is stored as part of tls. Save current device to destroy kernelArgs from the
// callback thread.
device_ = device;
if (device->info().largeBar_) {
graph_kernarg_base = reinterpret_cast<address>(device->deviceLocalAlloc(pool_size));
device_kernarg_pool_ = true;
Expand Down Expand Up @@ -977,13 +970,12 @@ address GraphKernelArgManager::AllocKernArg(size_t size, size_t alignment) {
}

void GraphKernelArgManager::ReadBackOrFlush() {
if (device_kernarg_pool_) {
auto device = g_devices[ihipGetDevice()]->devices()[0];
auto kernArgImpl = device->settings().kernel_arg_impl_;
if (device_kernarg_pool_ && device_) {
auto kernArgImpl = device_->settings().kernel_arg_impl_;

if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
*device->info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(device->info().hdpMemFlushCntl);
*device_->info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(device_->info().hdpMemFlushCntl);
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback &&
kernarg_graph_.back().kernarg_pool_addr_ != 0) {
address dev_ptr =
Expand Down
13 changes: 8 additions & 5 deletions hipamd/src/hip_graph_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,12 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
GraphKernelArgManager() : amd::ReferenceCountedObject() {}
~GraphKernelArgManager() {
//! Release the kernel arg pools
auto device = g_devices[ihipGetDevice()]->devices()[0];
for (auto& element : kernarg_graph_) {
device->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
if (device_ != nullptr) {
for (auto& element : kernarg_graph_) {
device_->hostFree(element.kernarg_pool_addr_, element.kernarg_pool_size_);
}
kernarg_graph_.clear();
}
kernarg_graph_.clear();
}

// Allocate kernel arg pool for the given size.
Expand All @@ -185,7 +186,8 @@ class GraphKernelArgManager : public amd::ReferenceCountedObject, public amd::Gr
size_t kernarg_pool_size_; //! Size of the pool
size_t kernarg_pool_offset_; //! Current offset in the kernel arg alloc
};
bool device_kernarg_pool_ = false; //! Indicate if kernel pool in device mem
bool device_kernarg_pool_ = false; //! Indicate if kernel pool in device mem
amd::Device* device_ = nullptr; //! Device from where kernel arguments are allocated
std::vector<KernelArgPoolGraph> kernarg_graph_; //! Vector of allocated kernarg pool
using KernelArgImpl = device::Settings::KernelArgImpl;
};
Expand Down Expand Up @@ -828,6 +830,7 @@ struct GraphExec : public amd::ReferenceCountedObject {
GraphKernelArgManager* GetKernelArgManager() {
return kernArgManager_;
}
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
};

struct ChildGraphNode : public GraphNode {
Expand Down
2 changes: 0 additions & 2 deletions hipamd/src/hip_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,6 @@ struct GraphNode;
struct GraphExec;
struct UserObject;
class Stream;
extern void ReleaseGraphExec(int deviceId);
extern void ReleaseGraphExec(hip::Stream* stream);
typedef struct ihipIpcMemHandle_st {
char ipc_handle[IHIP_IPC_MEM_HANDLE_SIZE]; ///< ipc memory handle on ROCr
size_t psize;
Expand Down
7 changes: 0 additions & 7 deletions hipamd/src/hip_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ void Stream::Destroy(hip::Stream* stream) {
// ================================================================================================
bool Stream::terminate() {
HostQueue::terminate();
ReleaseGraphExec(this);
return true;
}
// ================================================================================================
Expand Down Expand Up @@ -368,12 +367,6 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
}
// Wait for the current host queue
hip_stream->finish(wait_for_cpu);
if (stream == nullptr) {
// null stream will sync with other streams.
ReleaseGraphExec(hip_stream->DeviceId());
} else {
ReleaseGraphExec(hip_stream);
}
// Release freed memory for all memory pools on the device
hip_stream->GetDevice()->ReleaseFreedMemory();
return hipSuccess;
Expand Down

0 comments on commit f9f995c

Please sign in to comment.