Skip to content

Commit

Permalink
communicator bugfix: disjoint function does not have the correct max_local_peers value
Browse files Browse the repository at this point in the history

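local_peers was passed to the non-blocking function iallreduce_fn as a stack variable,
so the send buffer could go out of scope before the reduction completed.
Make it a member of the context struct so that it outlives the call and the correct value is reduced.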

Signed-off-by: Jessie Yang <[email protected]>
  • Loading branch information
jiaxiyan committed Jan 10, 2024
1 parent 74fbf8f commit d680336
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions ompi/communicator/comm_cid.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ struct ompi_comm_cid_context_t {
int iter;
/** storage for activate barrier */
int max_local_peers;
int local_peers;
char *port_string;
bool send_first;
int pml_tag;
Expand Down Expand Up @@ -267,6 +268,7 @@ static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t
context->send_first = send_first;
context->iter = 0;
context->max_local_peers = ompi_group_count_local_peers(newcomm->c_local_group);
context->local_peers = context->max_local_peers;

return context;
}
Expand Down Expand Up @@ -774,6 +776,11 @@ static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request);
/* Callback function to set communicator disjointness flags */
static inline void ompi_comm_set_disjointness_nb_complete(ompi_comm_cid_context_t *context)
{
/* Only set the disjoint flags when it is intra-communicator */
if (OMPI_COMM_IS_INTER(*context->newcommp)) {
return;
}

if (OMPI_COMM_IS_DISJOINT_SET(*context->newcommp)) {
opal_show_help("help-comm.txt", "disjointness-set-again", true);
return;
Expand Down Expand Up @@ -870,7 +877,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
ompi_comm_cid_context_t *context;
ompi_comm_request_t *request;
ompi_request_t *subreq;
int ret = 0, local_peers = -1;
int ret = 0;

/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
assert (NULL != comm);
Expand Down Expand Up @@ -907,9 +914,8 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
* 1. The communicator's disjointness is inferred from max_local_peers.
* 2. After the operation it is allowed to send messages over the new communicator.
*/
local_peers = context->max_local_peers;
ret = context->iallreduce_fn (&local_peers, &context->max_local_peers, 1, MPI_MAX, context,
&subreq);
ret = context->iallreduce_fn (&context->local_peers, &context->max_local_peers, 1, MPI_MAX, context,
&subreq);
if (OMPI_SUCCESS != ret) {
ompi_comm_request_return (request);
return ret;
Expand Down

0 comments on commit d680336

Please sign in to comment.