Skip to content

Commit

Permalink
Fix crash on shutdown with several workers per gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
sthibaul committed Aug 30, 2024
1 parent fe762ce commit 59c8218
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 7 deletions.
6 changes: 3 additions & 3 deletions src/drivers/cuda/driver_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid)
+if (worker->arch == STARPU_CUDA_WORKER && worker->subworkerid == 0 && worker->devid != devid)
{
int can = _starpu_cuda_peer_access(devid, worker->devid);
if (can)
Expand Down Expand Up @@ -896,7 +896,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_CUDA_WORKER)
+if (worker->arch == STARPU_CUDA_WORKER && worker->subworkerid == 0)
{
cures = starpu_cudaStreamCreate(&in_peer_transfer_streams[worker->devid][devid]);
if (STARPU_UNLIKELY(cures))
Expand Down Expand Up @@ -928,7 +928,7 @@ static void deinit_device_context(unsigned devid STARPU_ATTRIBUTE_UNUSED)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_CUDA_WORKER)
+if (worker->arch == STARPU_CUDA_WORKER && worker->subworkerid == 0)
{
cudaStreamDestroy(in_peer_transfer_streams[worker->devid][devid]);
#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
Expand Down
4 changes: 2 additions & 2 deletions src/drivers/cuda/driver_cuda1.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_CUDA_WORKER)
+if (worker->arch == STARPU_CUDA_WORKER && worker->subworkerid == 0)
{
cures = starpu_cudaStreamCreate(&in_peer_transfer_streams[worker->devid][devid]);
if (STARPU_UNLIKELY(cures))
Expand All @@ -424,7 +424,7 @@ static void deinit_device_context(unsigned devid STARPU_ATTRIBUTE_UNUSED)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_CUDA_WORKER)
+if (worker->arch == STARPU_CUDA_WORKER && worker->subworkerid == 0)
{
cudaStreamDestroy(in_peer_transfer_streams[worker->devid][devid]);
}
Expand Down
4 changes: 2 additions & 2 deletions src/drivers/hip/driver_hip.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_HIP_WORKER)
+if (worker->arch == STARPU_HIP_WORKER && worker->subworkerid == 0)
{
hipres = starpu_hipStreamCreate(&in_peer_transfer_streams[worker->devid][devid]);
if (STARPU_UNLIKELY(hipres))
Expand Down Expand Up @@ -609,7 +609,7 @@ static void deinit_device_context(unsigned devid STARPU_ATTRIBUTE_UNUSED)
for (workerid = 0; workerid < nworkers; workerid++)
{
struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-if (worker->arch == STARPU_HIP_WORKER)
+if (worker->arch == STARPU_HIP_WORKER && worker->subworkerid == 0)
{
hipStreamDestroy(in_peer_transfer_streams[worker->devid][devid]);
if (hip_peer_enabled[devid][worker->devid])
Expand Down

0 comments on commit 59c8218

Please sign in to comment.