Skip to content

Commit

Permalink
mca/coll: add reduce method (knomial)
Browse files Browse the repository at this point in the history
Use a tree-based method to perform the reduce; the tree can be
knomial/kary/binomial/binary. This method uses knomial by default. The
radix is user-configurable, and the default radix is 4 for
intranode; for internode, a larger k may be beneficial to performance.

Signed-off-by: Jun Tang <[email protected]>
  • Loading branch information
juntangc committed Feb 22, 2024
1 parent 51298c7 commit 8385e14
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 1 deletion.
1 change: 1 addition & 0 deletions ompi/mca/coll/base/coll_base_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_ou
int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
/* Tree-based reduce over a k-nomial tree; radix selects the k of the tree that is built. */
int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix);

/* Reduce_scatter */
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
Expand Down
146 changes: 146 additions & 0 deletions ompi/mca/coll/base/coll_base_reduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -1142,3 +1142,149 @@ int ompi_coll_base_reduce_intra_redscat_gather(
free(scount);
return err;
}

/**
 * Reduce over a k-nomial tree rooted at @p root.
 *
 * Every rank copies its contribution into a private accumulation buffer,
 * receives one full message from each of its children in the tree, folds
 * the children's data into the accumulation buffer and, unless it is the
 * root, forwards the partial result to its parent.  The root copies the
 * final result into @p recvbuf.  The whole message is moved in a single
 * transfer per tree edge (no segmentation).
 *
 * @param sendbuf   local contribution, or MPI_IN_PLACE (data is then taken
 *                  from @p recvbuf)
 * @param recvbuf   result buffer; written on the root only
 * @param count     number of elements of @p datatype
 * @param datatype  element datatype
 * @param op        reduction operation
 * @param root      rank that receives the result
 * @param comm      communicator
 * @param module    collective module holding the cached k-nomial tree
 * @param segsize   unused by this algorithm (kept for signature parity with
 *                  the other tree-based reduce implementations)
 * @param max_outstanding_reqs  only reported in the debug output
 * @param radix     radix (k) of the k-nomial tree; values below 2 are
 *                  treated as 2
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the tree, a temporary
 *         buffer or the request array cannot be obtained, or the error
 *         reported by the underlying copy/point-to-point call.
 *
 * NOTE(review): children are folded in tree order without regard to rank
 * order; confirm callers only select this algorithm for commutative ops.
 */
int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
                                         int count, ompi_datatype_t* datatype,
                                         ompi_op_t* op, int root,
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize,
                                         int max_outstanding_reqs, int radix)
{
    int err = OMPI_SUCCESS, rank, line = 0;
    ptrdiff_t extent, lb;
    char *child_buf = NULL;
    char *child_buf_start = NULL;
    char *reduce_buf = NULL;
    char *reduce_buf_start = NULL;
    char *sendtmpbuf = NULL;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;
    ompi_coll_tree_t* tree = NULL;
    int num_children = 0;
    ptrdiff_t buf_size, gap = 0;
    int max_reqs = 0, num_reqs = 0;
    ompi_request_t **reqs = NULL;

    /* Not used by the single-transfer algorithm; OPAL_OUTPUT may compile
     * to nothing, so silence unused-parameter warnings explicitly. */
    (void)segsize;
    (void)max_outstanding_reqs;

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d",
                 count, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /* Nothing to reduce.  count is identical on every rank of a reduce, so
     * all ranks take this exit together.  This also keeps a zero-byte
     * malloc() that returns NULL from being mistaken for out-of-memory. */
    if (0 == count) {
        return OMPI_SUCCESS;
    }

    /* The radix comes from an unchecked user setting; a radix below 2
     * cannot describe a k-nomial tree, so fall back to the smallest valid
     * one. */
    if (radix < 2) {
        radix = 2;
    }

    /* (re)build the cached k-nomial tree for this root and radix */
    COLL_BASE_UPDATE_KMTREE(comm, base_module, root, radix);
    if (NULL == data->cached_kmtree) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "REDUCE: failed to create knomial tree. \n"));
        /* report the failure instead of returning the initial OMPI_SUCCESS */
        err = OMPI_ERR_OUT_OF_RESOURCE;
        line = __LINE__;
        goto err_hndl;
    }

    tree = data->cached_kmtree;
    num_children = tree->tree_nextsize;

    ompi_datatype_get_extent(datatype, &lb, &extent);

    sendtmpbuf = (char*) sendbuf;
    if( sendbuf == MPI_IN_PLACE ) {
        sendtmpbuf = (char *)recvbuf;
    }

    /* private accumulation buffer, seeded with the local contribution */
    buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
    reduce_buf = (char *)malloc(buf_size);
    if (NULL == reduce_buf) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        line = __LINE__;
        goto err_hndl;
    }
    reduce_buf_start = reduce_buf - gap;
    err = ompi_datatype_copy_content_same_ddt(datatype, count,
                                              (char*)reduce_buf_start,
                                              (char*)sendtmpbuf);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* do transfer in a single transaction instead of segments:
     * one slot of `count` elements per child */
    if (num_children > 0) {
        buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
        child_buf = (char *)malloc(buf_size);
        if (NULL == child_buf) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            line = __LINE__;
            goto err_hndl;
        }
        child_buf_start = child_buf - gap;

        reqs = ompi_coll_base_comm_get_reqs(data, num_children);
        if (NULL == reqs) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            line = __LINE__;
            goto err_hndl;
        }
        max_reqs = num_children;
    }

    /* post all receives from the children before waiting on any of them */
    for (int i = 0; i < num_children; i++) {
        int child = tree->tree_next[i];
        err = MCA_PML_CALL(irecv(child_buf_start + (ptrdiff_t)i * count * extent,
                                 count,
                                 datatype,
                                 child,
                                 MCA_COLL_BASE_TAG_REDUCE,
                                 comm,
                                 &reqs[num_reqs++]));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    if (num_reqs > 0) {
        err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUSES_IGNORE);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* fold every child's slot into the local partial result; use the
     * gap-adjusted pointer so it matches the copy above and the send below */
    for (int i = 0; i < num_children; i++) {
        ompi_op_reduce(op,
                       child_buf_start + (ptrdiff_t)i * count * extent,
                       reduce_buf_start,
                       count,
                       datatype);
    }

    if (rank != root) {
        /* forward the partial result to the parent in the tree */
        err = MCA_PML_CALL(send(reduce_buf_start,
                                count,
                                datatype,
                                tree->tree_prev,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD,
                                comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    } else {
        /* root: deliver the final result to the user buffer */
        err = ompi_datatype_copy_content_same_ddt(datatype, count,
                                                  (char*)recvbuf,
                                                  (char*)reduce_buf_start);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* all requests have completed; release the temporary buffers */
    free(child_buf);
    free(reduce_buf);
    return OMPI_SUCCESS;

 err_hndl:
    /* Deal with the requests first so none of them can still refer to the
     * receive buffer when it is released below. */
    if( NULL != reqs ) {
        if (MPI_ERR_IN_STATUS == err) {
            /* surface the first concrete error recorded on a request */
            for (int i = 0; i < max_reqs; i++) {
                if (MPI_REQUEST_NULL == reqs[i]) continue;
                if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
                if (reqs[i]->req_status.MPI_ERROR != MPI_SUCCESS) {
                    err = reqs[i]->req_status.MPI_ERROR;
                    break;
                }
            }
        }
        ompi_coll_base_free_reqs(reqs, max_reqs);
    }
    free(child_buf);
    free(reduce_buf);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}
18 changes: 17 additions & 1 deletion ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ static int coll_tuned_reduce_segment_size = 0;
static int coll_tuned_reduce_max_requests;
static int coll_tuned_reduce_tree_fanout;
static int coll_tuned_reduce_chain_fanout;
/* Radix passed to the knomial reduce algorithm; default 4, exposed as the
 * "reduce_algorithm_knomial_radix" MCA variable. */
static int coll_tuned_reduce_knomial_radix = 4;

/* valid values for coll_tuned_reduce_forced_algorithm */
static const mca_base_var_enum_value_t reduce_algorithms[] = {
Expand All @@ -42,6 +43,8 @@ static const mca_base_var_enum_value_t reduce_algorithms[] = {
    {5, "binomial"},
    {6, "in-order_binary"},
    {7, "rabenseifner"},
    /* listed once: each algorithm id maps to exactly one name */
    {8, "knomial"},
    {0, NULL}
};

Expand Down Expand Up @@ -80,7 +83,7 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner. "
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner, 8 knomial. "
"Only relevant if coll_tuned_use_dynamic_rules is true.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
Expand Down Expand Up @@ -121,6 +124,15 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_reduce_chain_fanout);

coll_tuned_reduce_knomial_radix = 4;
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_knomial_radix",
"k-nomial tree radix for the reduce algorithm (radix > 1).",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_reduce_knomial_radix);

coll_tuned_reduce_max_requests = 0; /* no limit for reduce by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
Expand Down Expand Up @@ -177,6 +189,10 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, int count
segsize, max_requests);
case (7): return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype,
op, root, comm, module);
case (8): return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests,
coll_tuned_reduce_knomial_radix);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
Expand Down

0 comments on commit 8385e14

Please sign in to comment.