Skip to content

Commit 5f205ef

Browse files
Fix multithread request wait
Signed-off-by: Florent Germain <[email protected]>
1 parent 0922d15 commit 5f205ef

File tree

8 files changed

+100
-34
lines changed

8 files changed

+100
-34
lines changed

ompi/communicator/comm_cid.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ int ompi_comm_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
544544
}
545545

546546
if (&ompi_request_empty != req) {
547-
ompi_request_wait_completion (req);
547+
ompi_request_wait_completion (&req);
548548
rc = req->req_status.MPI_ERROR;
549549
ompi_comm_request_return ((ompi_comm_request_t *) req);
550550
}
@@ -909,7 +909,7 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
909909
}
910910

911911
if (&ompi_request_empty != req) {
912-
ompi_request_wait_completion (req);
912+
ompi_request_wait_completion (&req);
913913
rc = req->req_status.MPI_ERROR;
914914
ompi_comm_request_return ((ompi_comm_request_t *) req);
915915
}

ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3200,7 +3200,7 @@ int mca_coll_ftagree_era_intra(void *contrib,
32003200
rc = mca_coll_ftagree_iera_intra(contrib, dt_count, dt, op, group, grp_update, comm, &req, module);
32013201
if(OPAL_UNLIKELY( OMPI_SUCCESS != rc ))
32023202
return rc;
3203-
ompi_request_wait_completion(req);
3203+
ompi_request_wait_completion(&req);
32043204
rc = req->req_status.MPI_ERROR;
32053205
ompi_request_free(&req);
32063206
return rc;

ompi/mca/pml/cm/pml_cm.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,8 @@ mca_pml_cm_recv(void *addr,
204204
return ret;
205205
}
206206

207-
ompi_request_wait_completion(&req.req_ompi);
207+
ompi_request_t *ompi_req = &req.req_ompi;
208+
ompi_request_wait_completion(&ompi_req);
208209

209210
if (MPI_STATUS_IGNORE != status) {
210211
OMPI_COPY_STATUS(status, req.req_ompi.req_status, false);
@@ -533,7 +534,8 @@ mca_pml_cm_mrecv(void *buf,
533534
return ret;
534535
}
535536

536-
ompi_request_wait_completion(&recvreq->req_base.req_ompi);
537+
ompi_request_t *ompi_req = &recvreq->req_base.req_ompi;
538+
ompi_request_wait_completion(&ompi_req);
537539

538540
if (MPI_STATUS_IGNORE != status) {
539541
OMPI_COPY_STATUS(status, recvreq->req_base.req_ompi.req_status, false);

ompi/mca/pml/ob1/pml_ob1_iprobe.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ int mca_pml_ob1_probe(int src,
6969
MCA_PML_OB1_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, false);
7070
MCA_PML_OB1_RECV_REQUEST_START(&recvreq);
7171

72-
ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
72+
ompi_request_t *ompi_req = &recvreq.req_recv.req_base.req_ompi;
73+
ompi_request_wait_completion(&ompi_req);
7374
rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
7475
if( MPI_STATUS_IGNORE != status ) {
7576
OMPI_COPY_STATUS(status, recvreq.req_recv.req_base.req_ompi.req_status, false);
@@ -159,7 +160,8 @@ mca_pml_ob1_mprobe(int src,
159160
src, tag, comm, false);
160161
MCA_PML_OB1_RECV_REQUEST_START(recvreq);
161162

162-
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
163+
ompi_request_t *ompi_req = &recvreq->req_recv.req_base.req_ompi;
164+
ompi_request_wait_completion(&ompi_req);
163165
rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
164166
if( MPI_STATUS_IGNORE != status ) {
165167
OMPI_COPY_STATUS(status, recvreq->req_recv.req_base.req_ompi.req_status, false);

ompi/mca/pml/ob1/pml_ob1_irecv.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ int mca_pml_ob1_recv(void *addr,
133133
PERUSE_RECV);
134134

135135
MCA_PML_OB1_RECV_REQUEST_START(recvreq);
136-
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
136+
ompi_request_t *ompi_req = &recvreq->req_recv.req_base.req_ompi;
137+
ompi_request_wait_completion(&ompi_req);
137138

138139
if (recvreq->req_recv.req_base.req_pml_complete) {
139140
/* make buffer defined when the request is completed */
@@ -153,7 +154,8 @@ int mca_pml_ob1_recv(void *addr,
153154
#if OPAL_ENABLE_FT_MPI
154155
if( OPAL_UNLIKELY( MPI_ERR_PROC_FAILED_PENDING == rc )) {
155156
ompi_request_cancel(&recvreq->req_recv.req_base.req_ompi);
156-
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
157+
ompi_request_t *ft_req = &recvreq->req_recv.req_base.req_ompi;
158+
ompi_request_wait_completion(&ft_req);
157159
rc = MPI_ERR_PROC_FAILED;
158160
}
159161
#endif
@@ -358,7 +360,8 @@ mca_pml_ob1_mrecv( void *buf,
358360

359361
ompi_message_return(*message);
360362
*message = MPI_MESSAGE_NULL;
361-
ompi_request_wait_completion(&(recvreq->req_recv.req_base.req_ompi));
363+
ompi_request_t *ompi_req = &recvreq->req_recv.req_base.req_ompi;
364+
ompi_request_wait_completion(&ompi_req);
362365

363366
MCA_PML_OB1_RECV_FRAG_RETURN(frag);
364367

@@ -369,7 +372,8 @@ mca_pml_ob1_mrecv( void *buf,
369372
#if OPAL_ENABLE_FT_MPI
370373
if( OPAL_UNLIKELY( MPI_ERR_PROC_FAILED_PENDING == rc )) {
371374
ompi_request_cancel(&recvreq->req_recv.req_base.req_ompi);
372-
ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
375+
ompi_request_t *ft_req = &recvreq->req_recv.req_base.req_ompi;
376+
ompi_request_wait_completion(&ft_req);
373377
rc = MPI_ERR_PROC_FAILED;
374378
}
375379
#endif

ompi/mca/pml/ob1/pml_ob1_isend.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ int mca_pml_ob1_send(const void *buf,
279279
return rc;
280280
}
281281

282-
ompi_request_wait_completion (brequest);
282+
ompi_request_wait_completion (&brequest);
283283
ompi_request_free (&brequest);
284284
return OMPI_SUCCESS;
285285
}
@@ -324,7 +324,8 @@ int mca_pml_ob1_send(const void *buf,
324324

325325
MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc);
326326
if (OPAL_LIKELY(rc == OMPI_SUCCESS)) {
327-
ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);
327+
ompi_request_t *ompi_req = &sendreq->req_send.req_base.req_ompi;
328+
ompi_request_wait_completion(&ompi_req);
328329

329330
rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
330331
}

ompi/request/req_wait.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ int ompi_request_default_wait(
3535
ompi_request_t ** req_ptr,
3636
ompi_status_public_t * status)
3737
{
38-
ompi_request_t *req = *req_ptr;
38+
ompi_request_wait_completion(req_ptr);
3939

40-
ompi_request_wait_completion(req);
40+
ompi_request_t *req = *req_ptr;
4141

4242
/* make sure we get the correct status */
4343
opal_atomic_rmb();
@@ -78,10 +78,25 @@ int ompi_request_default_wait(
7878
return req->req_status.MPI_ERROR;
7979
}
8080

81+
/* Swap the request with MPI_REQUEST_NULL
82+
* Make sure ompi_request_free is called only once
83+
*/
84+
ompi_request_t *old_req = *req_ptr;
85+
ompi_request_t *new_req = MPI_REQUEST_NULL;
86+
87+
retry:
88+
if (new_req == old_req) {
89+
return OMPI_SUCCESS;
90+
}
91+
92+
if (!OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(req_ptr, &old_req, new_req)) {
93+
goto retry;
94+
}
95+
8196
/* If there's an error while freeing the request, assume that the
8297
request is still there. Otherwise, Bad Things will happen
8398
later! */
84-
return ompi_request_free(req_ptr);
99+
return ompi_request_free(&old_req);
85100
}
86101

87102

ompi/request/request.h

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -447,53 +447,84 @@ static inline bool ompi_request_tag_is_collective(int tag) {
447447
* Wait a particular request for completion
448448
*/
449449

450-
static inline void ompi_request_wait_completion(ompi_request_t *req)
450+
static inline void ompi_request_wait_completion(ompi_request_t **req)
451451
{
452452
if (opal_using_threads ()) {
453-
if(!REQUEST_COMPLETE(req)) {
453+
if(!REQUEST_COMPLETE(*req)) {
454454
void *_tmp_ptr;
455455
ompi_wait_sync_t sync;
456456

457457

458458
#if OPAL_ENABLE_FT_MPI
459459
redo:
460-
if(OPAL_UNLIKELY( ompi_request_is_failed(req) )) {
460+
if(OPAL_UNLIKELY( ompi_request_is_failed(*req) )) {
461461
return;
462462
}
463463
#endif /* OPAL_ENABLE_FT_MPI */
464464
_tmp_ptr = REQUEST_PENDING;
465465

466466
WAIT_SYNC_INIT(&sync, 1);
467467

468-
if (OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&req->req_complete, &_tmp_ptr, &sync)) {
468+
/* CAS sync on the req->req_complete field.
469+
* - If the request is PENDING, this will let know other waiting
470+
* threads and the ompi_request_complete that we are waiting on it.
471+
*
472+
* - If the request is already completed, clean sync and exit
473+
*
474+
* - If another thread is already waiting on this request, stack on it
475+
* We will signal its sync when the request is completed
476+
*/
477+
if (OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&(*req)->req_complete, &_tmp_ptr, &sync)) {
478+
/* I'm the first one to wait on this request. */
469479
SYNC_WAIT(&sync);
470-
} else {
471-
/* completed before we had a chance to swap in the sync object */
480+
} else if (REQUEST_COMPLETE(*req)) {
481+
/* Completed before we had a chance to swap in the sync object
482+
* Clean sync and exit */
472483
WAIT_SYNC_SIGNALLED(&sync);
484+
} else {
485+
/* Another thread is waiting on the request.
486+
* It's sync is stored in _tmp_ptr */
487+
stack_retry:
488+
/* Try to stack our sync on the request */
489+
if (OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&(*req)->req_complete, &_tmp_ptr, &sync)) {
490+
/* Successfully stacked on the request */
491+
SYNC_WAIT(&sync);
492+
493+
/* Request is completed, continue unstacking */
494+
wait_sync_update((ompi_wait_sync_t*) _tmp_ptr, 1, sync.status);
495+
} else if (REQUEST_COMPLETE(*req)) {
496+
/* Completed before I could stack on the request.
497+
* Clean sync and exit */
498+
WAIT_SYNC_SIGNALLED(&sync);
499+
} else {
500+
/* Someone else has successfully stacked its sync.
501+
* Retry */
502+
goto stack_retry;
503+
}
473504
}
474505

475506
#if OPAL_ENABLE_FT_MPI
476507
if (OPAL_UNLIKELY(OMPI_SUCCESS != sync.status)) {
477-
OPAL_OUTPUT_VERBOSE((50, ompi_ftmpi_output_handle, "Status %d reported for sync %p rearming req %p", sync.status, (void*)&sync, (void*)req));
508+
OPAL_OUTPUT_VERBOSE((50, ompi_ftmpi_output_handle, "Status %d reported for sync %p rearming req %p", sync.status, (void*)&sync, (void*)(*req)));
478509
_tmp_ptr = &sync;
479-
if (OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&req->req_complete, &_tmp_ptr, REQUEST_PENDING)) {
480-
opal_output_verbose(10, ompi_ftmpi_output_handle, "Status %d reported for sync %p rearmed req %p", sync.status, (void*)&sync, (void*)req);
510+
if (OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&(*req)->req_complete, &_tmp_ptr, REQUEST_PENDING)) {
511+
opal_output_verbose(10, ompi_ftmpi_output_handle, "Status %d reported for sync %p rearmed req %p", sync.status, (void*)&sync, (void*)(*req));
481512
WAIT_SYNC_RELEASE(&sync);
482513
goto redo;
483514
}
484515
}
485516
#endif /* OPAL_ENABLE_FT_MPI */
486-
assert(REQUEST_COMPLETE(req));
517+
assert(REQUEST_COMPLETE(*req));
487518
WAIT_SYNC_RELEASE(&sync);
488-
}
489-
opal_atomic_rmb();
519+
}
520+
opal_atomic_rmb();
490521
} else {
491-
while(!REQUEST_COMPLETE(req)) {
522+
while(!REQUEST_COMPLETE(*req)) {
492523
opal_progress();
493524
#if OPAL_ENABLE_FT_MPI
494525
/* Check to make sure that process failure did not break the
495526
* request. */
496-
if(OPAL_UNLIKELY( ompi_request_is_failed(req) )) {
527+
if(OPAL_UNLIKELY( ompi_request_is_failed(*req) )) {
497528
break;
498529
}
499530
#endif /* OPAL_ENABLE_FT_MPI */
@@ -530,10 +561,21 @@ static inline int ompi_request_complete(ompi_request_t* request, bool with_signa
530561
/* make sure everything in the request is visible before we mark it complete */
531562
opal_atomic_wmb();
532563

533-
ompi_wait_sync_t *tmp_sync = (ompi_wait_sync_t *) OPAL_ATOMIC_SWAP_PTR(&request->req_complete,
534-
REQUEST_COMPLETED);
535-
if( REQUEST_PENDING != tmp_sync ) {
536-
wait_sync_update(tmp_sync, 1, request->req_status.MPI_ERROR);
564+
void *_tmp_ptr = REQUEST_PENDING;
565+
566+
if(!OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_COMPLETED)) {
567+
/* CAS failled: someone is waiting on this request
568+
* Its sync structure is stored in _tmp_ptr
569+
* Mark the request as completed and store the waiting thread sync.
570+
*/
571+
while (!OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_COMPLETED)) {
572+
}
573+
574+
/* In the case where another thread concurrently changed the request to REQUEST_PENDING or REQUEST_COMPLETED */
575+
if( REQUEST_PENDING != _tmp_ptr
576+
&& REQUEST_COMPLETED != _tmp_ptr) {
577+
wait_sync_update((ompi_wait_sync_t*) _tmp_ptr, 1, request->req_status.MPI_ERROR);
578+
}
537579
}
538580
} else {
539581
request->req_complete = REQUEST_COMPLETED;

0 commit comments

Comments
 (0)