Skip to content

Commit

Permalink
mca/coll: add allreduce method based on allgather (bruck)
Browse files Browse the repository at this point in the history
Use allgather to implement allreduce; this reduces latency compared to
gathering and reducing on the root followed by a broadcast to the other
ranks, at the cost of additional memory usage and message exchanges.

Signed-off-by: Jun Tang <[email protected]>
  • Loading branch information
juntangc committed Feb 22, 2024
1 parent aca2938 commit 51298c7
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 1 deletion.
82 changes: 82 additions & 0 deletions ompi/mca/coll/base/coll_base_allreduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -1376,4 +1376,86 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
return err;
}

/*
 * ompi_coll_base_allreduce_intra_k_bruck
 *
 * Function:  Allreduce implemented as an allgather followed by a local
 *            reduction performed independently on every rank.
 * Accepts:   sbuf   - send buffer, or MPI_IN_PLACE to take the input from rbuf
 *            rbuf   - receive buffer; holds the reduced result on return
 *            count  - number of elements of dtype contributed by each rank
 *            dtype  - datatype of the elements
 *            op     - reduction operation
 *            comm   - communicator
 *            module - collective module (not used by this implementation)
 * Returns:   OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE if a
 *            temporary buffer cannot be allocated, otherwise the error code
 *            reported by the failing datatype copy or allgather call.
 *
 * Memory:    Allocates one temporary buffer spanning (size * count) elements
 *            and one spanning count elements; both are released on every
 *            exit path.
 */
int ompi_coll_base_allreduce_intra_k_bruck(const void *sbuf, void *rbuf, int count,
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int line = -1;
    int err = OMPI_SUCCESS;
    char *partial_buf = NULL;        /* allgather destination: size * count elements */
    char *partial_buf_start = NULL;  /* partial_buf adjusted by the datatype gap */
    char *sendtmpbuf = NULL;         /* where this rank's contribution is read from */
    char *buffer1 = NULL;            /* private copy of this rank's contribution */
    char *buffer1_start = NULL;      /* buffer1 adjusted by the datatype gap */

    ptrdiff_t extent, lb;
    ompi_datatype_get_extent(dtype, &lb, &extent);

    int rank = ompi_comm_rank(comm);
    int size = ompi_comm_size(comm);

    /* With MPI_IN_PLACE the input data lives in the receive buffer. */
    sendtmpbuf = (char *) sbuf;
    if (MPI_IN_PLACE == sbuf) {
        sendtmpbuf = (char *) rbuf;
    }

    ptrdiff_t buf_size, gap = 0;

    /* Buffer receiving the contribution of every rank. */
    buf_size = opal_datatype_span(&dtype->super, (int64_t) count * size, &gap);
    partial_buf = (char *) malloc(buf_size);
    /* A NULL result for a zero-byte request is not treated as a failure. */
    if (NULL == partial_buf && buf_size > 0) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        line = __LINE__;
        goto err_hndl;
    }
    partial_buf_start = partial_buf - gap;

    /* Buffer holding a copy of the local contribution, used as the
     * allgather send buffer. */
    buf_size = opal_datatype_span(&dtype->super, (int64_t) count, &gap);
    buffer1 = (char *) malloc(buf_size);
    if (NULL == buffer1 && buf_size > 0) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        line = __LINE__;
        goto err_hndl;
    }
    buffer1_start = buffer1 - gap;

    err = ompi_datatype_copy_content_same_ddt(dtype, count,
                                              (char *) buffer1_start,
                                              (char *) sendtmpbuf);
    if (MPI_SUCCESS != err) {
        line = __LINE__;
        goto err_hndl;
    }

    /* Allgather the data so that each rank has a full copy to reduce
     * locally (trades bandwidth for better latency). */
    err = comm->c_coll->coll_allgather(buffer1_start, count, dtype,
                                       partial_buf_start, count, dtype,
                                       comm, comm->c_coll->coll_allgather_module);
    if (MPI_SUCCESS != err) {
        line = __LINE__;
        goto err_hndl;
    }

    /* Fold the block of every rank >= 1 into block 0 of the gathered data.
     * NOTE(review): blocks are combined in ascending rank order with block 0
     * as the accumulator; confirm against ompi_op_reduce's operand order
     * that this is valid for non-commutative operations. */
    for (int target = 1; target < size; target++) {
        ompi_op_reduce(op,
                       partial_buf_start + (ptrdiff_t) target * count * extent,
                       partial_buf_start,
                       count,
                       dtype);
    }

    /* Move the reduced result (block 0) to rbuf. */
    err = ompi_datatype_copy_content_same_ddt(dtype, count,
                                              (char *) rbuf,
                                              (char *) partial_buf_start);
    if (MPI_SUCCESS != err) {
        line = __LINE__;
        goto err_hndl;
    }

    /* Release both temporaries; the gather buffer must be freed here as
     * well, otherwise every successful call leaks size * count elements. */
    free(partial_buf);
    free(buffer1);
    return OMPI_SUCCESS;

 err_hndl:
    /* free(NULL) is a no-op, so this is safe whichever step failed. */
    free(partial_buf);
    free(buffer1);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line; // silence compiler warning
    (void)rank; // silence compiler warning
    return err;
}
/* copied function (with appropriate renaming) ends here */
1 change: 1 addition & 0 deletions ompi/mca/coll/base/coll_base_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsi
int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_k_bruck(ALLREDUCE_ARGS);

/* AlltoAll */
int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
Expand Down
5 changes: 4 additions & 1 deletion ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ static const mca_base_var_enum_value_t allreduce_algorithms[] = {
{5, "segmented_ring"},
{6, "rabenseifner"},
{7, "allgather_reduce"},
{8, "allreduce_bruck"},
{0, NULL}
};

Expand Down Expand Up @@ -78,7 +79,7 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring. "
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring, 6 rabenseifner, 7 allgather_reduce, 8 allreduce_bruck. "
"Only relevant if coll_tuned_use_dynamic_rules is true.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
Expand Down Expand Up @@ -149,6 +150,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, int co
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module);
case (7):
return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module);
case (8):
return ompi_coll_base_allreduce_intra_k_bruck(sbuf, rbuf, count, dtype, op, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
Expand Down

0 comments on commit 51298c7

Please sign in to comment.