-
Notifications
You must be signed in to change notification settings - Fork 284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
comm: create local_group/remote_group beform comm commit #7237
Open
hzhou
wants to merge
19
commits into
pmodels:main
Choose a base branch
from
hzhou:2412_group_comm
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
225e98f
test: remove unused test glpid
hzhou 0a21659
group: remove unused groupdebug.c
hzhou 976d46c
group: abstract group access and lpid integer type
hzhou 729e8a7
misc: use the new group rank/lpid conversion routines
hzhou 87f5fac
comm: use MPIR_Group_create_{map, stride)
hzhou 59c1d05
group: rearange functions in group_impl.c
hzhou 8dbe66e
group: refactor group_impl.c to use new group interfaces
hzhou 9b05ad0
group: refactor MPIR_Group
hzhou ed910e9
---- START HERE ----
hzhou 37b9eb9
mpid/ch4: remove MPIDI_NM_comm_get_gpid
hzhou 4a7fd12
mpid: replace usage of uint64_t lpid with MPIR_Lpid
hzhou b18ac27
group: add MPIR_Worlds
hzhou c28b290
group: add builtin MPIR_GROUP_{WORLD,SELF}
hzhou 21b288e
group: add MPIR_Group_dup
hzhou b24bbcb
binding/group: remove error check in MPI_Group_free
hzhou 6082a68
comm: always set local_group and remote_group
hzhou 138bad2
group: avoid freeing MPIR_Group_empty
hzhou 6d281b9
ch4: release init_comm->local_group
hzhou 5a53a62
ch4: assert group before communicator commit
hzhou File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,9 @@ int MPIR_init_comm_world(void) | |
MPIR_Process.comm_world->remote_size = MPIR_Process.size; | ||
MPIR_Process.comm_world->local_size = MPIR_Process.size; | ||
|
||
MPIR_Process.comm_world->local_group = MPIR_GROUP_WORLD_PTR; | ||
MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe explicitly set |
||
|
||
mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_world); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
|
@@ -59,6 +62,9 @@ int MPIR_init_comm_self(void) | |
MPIR_Process.comm_self->remote_size = 1; | ||
MPIR_Process.comm_self->local_size = 1; | ||
|
||
MPIR_Process.comm_self->local_group = MPIR_GROUP_SELF_PTR; | ||
MPIR_Group_add_ref(MPIR_GROUP_SELF_PTR); | ||
|
||
mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_self); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
|
@@ -91,6 +97,9 @@ int MPIR_init_icomm_world(void) | |
MPIR_Process.icomm_world->remote_size = MPIR_Process.size; | ||
MPIR_Process.icomm_world->local_size = MPIR_Process.size; | ||
|
||
MPIR_Process.icomm_world->local_group = MPIR_GROUP_WORLD_PTR; | ||
MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); | ||
|
||
mpi_errno = MPIR_Comm_commit(MPIR_Process.icomm_world); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -337,8 +337,7 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
(*newcomm_ptr)->local_group = group_ptr; | ||
MPIR_Group_add_ref(group_ptr); | ||
|
||
(*newcomm_ptr)->remote_group = group_ptr; | ||
MPIR_Group_add_ref(group_ptr); | ||
(*newcomm_ptr)->remote_group = NULL; | ||
(*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; | ||
(*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; | ||
|
||
|
@@ -382,15 +381,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
int mpi_errno = MPI_SUCCESS; | ||
int new_context_id; | ||
int *mapping = NULL; | ||
int *remote_mapping = NULL; | ||
MPIR_Comm *mapping_comm = NULL; | ||
int remote_size = -1; | ||
int rinfo[2]; | ||
MPIR_CHKLMEM_DECL(1); | ||
|
||
MPIR_FUNC_ENTER; | ||
|
||
MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); | ||
MPIR_Session *session_ptr = comm_ptr->session_ptr; | ||
|
||
/* Create a new communicator from the specified group members */ | ||
|
||
|
@@ -409,6 +405,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
MPIR_Assert(new_context_id != 0); | ||
MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); | ||
|
||
MPIR_Comm *mapping_comm; | ||
mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
|
@@ -434,7 +431,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
|
||
(*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; | ||
|
||
MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); | ||
MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); | ||
} | ||
|
||
/* There is an additional step. We must communicate the | ||
|
@@ -445,6 +442,11 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
* in the remote group, from which the remote network address | ||
* mapping can be constructed. We need to use the "collective" | ||
* context in the original intercommunicator */ | ||
|
||
int remote_size = -1; | ||
int *remote_mapping; /* a list of remote ranks */ | ||
int rinfo[2]; | ||
|
||
if (comm_ptr->rank == 0) { | ||
int info[2]; | ||
info[0] = new_context_id; | ||
|
@@ -494,6 +496,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
|
||
MPIR_Assert(remote_size >= 0); | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove blank line. |
||
if (group_ptr->rank != MPI_UNDEFINED) { | ||
(*newcomm_ptr)->remote_size = remote_size; | ||
/* Now, everyone has the remote_mapping, and can apply that to | ||
|
@@ -505,6 +508,23 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co | |
mapping, remote_mapping, mapping_comm, *newcomm_ptr); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
/* create remote_group. | ||
* FIXME: we can directly exchange group maps once we get rid of comm mappers */ | ||
MPIR_Group *remote_group; | ||
|
||
MPIR_Lpid *remote_map; | ||
remote_map = MPL_malloc(remote_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); | ||
MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); | ||
|
||
MPIR_Group *mapping_group = mapping_comm->remote_group; | ||
MPIR_Assert(mapping_group); | ||
for (int i = 0; i < remote_size; i++) { | ||
remote_map[i] = MPIR_Group_rank_to_lpid(mapping_group, remote_mapping[i]); | ||
} | ||
mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_map, | ||
&remote_group); | ||
(*newcomm_ptr)->remote_group = remote_group; | ||
|
||
(*newcomm_ptr)->tainted = comm_ptr->tainted; | ||
mpi_errno = MPIR_Comm_commit(*newcomm_ptr); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
@@ -605,8 +625,7 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in | |
(*newcomm_ptr)->local_group = group_ptr; | ||
MPIR_Group_add_ref(group_ptr); | ||
|
||
(*newcomm_ptr)->remote_group = group_ptr; | ||
MPIR_Group_add_ref(group_ptr); | ||
(*newcomm_ptr)->remote_group = NULL; | ||
(*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; | ||
(*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; | ||
|
||
|
@@ -913,6 +932,9 @@ int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr) | |
int mpi_errno = MPI_SUCCESS; | ||
MPIR_FUNC_ENTER; | ||
|
||
/* FIXME: remove the following remote_group creation once this assertion passes */ | ||
MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm_ptr->remote_group); | ||
|
||
/* Create a group and populate it with the local process ids */ | ||
if (!comm_ptr->remote_group) { | ||
int n = comm_ptr->remote_size; | ||
|
@@ -965,6 +987,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, | |
uint64_t *remote_lpids = NULL; | ||
int comm_info[3]; | ||
int is_low_group = 0; | ||
MPIR_Session *session_ptr = local_comm_ptr->session_ptr; | ||
|
||
MPIR_FUNC_ENTER; | ||
|
||
|
@@ -1042,7 +1065,14 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, | |
(*new_intercomm_ptr)->local_comm = 0; | ||
(*new_intercomm_ptr)->is_low_group = is_low_group; | ||
|
||
MPIR_Comm_set_session_ptr(*new_intercomm_ptr, local_comm_ptr->session_ptr); | ||
(*new_intercomm_ptr)->local_group = local_comm_ptr->local_group; | ||
MPIR_Group_add_ref(local_comm_ptr->local_group); | ||
|
||
/* construct remote_group */ | ||
mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids, | ||
&(*new_intercomm_ptr)->remote_group); | ||
|
||
MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr); | ||
|
||
mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); | ||
if (mpi_errno) | ||
|
@@ -1064,8 +1094,6 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, | |
|
||
|
||
fn_exit: | ||
MPL_free(remote_lpids); | ||
remote_lpids = NULL; | ||
MPIR_FUNC_EXIT; | ||
return mpi_errno; | ||
fn_fail: | ||
|
@@ -1106,6 +1134,15 @@ int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, | |
} | ||
MPID_THREAD_CS_EXIT(VCI, comm_self->mutex); | ||
|
||
MPIR_Session *session_ptr = NULL; /* Can we just use NULL session since peer_intercomm is always temporary? */ | ||
MPIR_Lpid my_lpid = MPIR_Group_rank_to_lpid(comm_self->local_group, 0); | ||
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, 1, | ||
&(*newcomm)->local_group); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1, 1, | ||
&(*newcomm)->remote_group); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
||
(*newcomm)->tainted = 1; | ||
mpi_errno = MPIR_Comm_commit(*newcomm); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
@@ -1222,6 +1259,37 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i | |
|
||
MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); | ||
|
||
/* construct local_group */ | ||
MPIR_Group *new_local_group; | ||
|
||
MPIR_Lpid *map; | ||
map = MPL_malloc(new_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); | ||
MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); | ||
|
||
int myrank; | ||
MPIR_Group *group1, *group2; | ||
if (local_high) { | ||
group1 = comm_ptr->remote_group; | ||
group2 = comm_ptr->local_group; | ||
myrank = group1->size + group2->rank; | ||
} else { | ||
group1 = comm_ptr->local_group; | ||
group2 = comm_ptr->remote_group; | ||
myrank = group1->rank; | ||
} | ||
for (int i = 0; i < group1->size; i++) { | ||
map[i] = MPIR_Group_rank_to_lpid(group1, i); | ||
} | ||
for (int i = 0; i < group2->size; i++) { | ||
map[group1->size + i] = MPIR_Group_rank_to_lpid(group2, i); | ||
} | ||
|
||
mpi_errno = MPIR_Group_create_map(new_size, myrank, comm_ptr->session_ptr, map, | ||
&new_local_group); | ||
|
||
(*new_intracomm_ptr)->local_group = new_local_group; | ||
MPIR_Group_add_ref(new_local_group); | ||
|
||
/* Now we know which group comes first. Build the new mapping | ||
* from the existing comm */ | ||
mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); | ||
|
@@ -1260,6 +1328,7 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i | |
(*new_intracomm_ptr)->recvcontext_id = new_context_id; | ||
|
||
MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); | ||
(*new_intracomm_ptr)->local_group = new_local_group; | ||
|
||
mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); | ||
MPIR_ERR_CHECK(mpi_errno); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the
remote_group
isNULL
, theremote_size
should probably be 0.Or, if we want to keep
local_size == remote_size
for now to lessen the impact on existing codes, we should update the comment in the struct definition and maybe add a TODO for future cleanup.