From 99f0482108dd9629920e6dbc406b6fd45497a832 Mon Sep 17 00:00:00 2001
From: Jun Tang
Date: Sun, 11 Feb 2024 18:12:50 +0000
Subject: [PATCH] add Bruck's method to allgather and allreduce; add k-nomial to reduce

Signed-off-by: Jun Tang

---
 ompi/mca/coll/base/coll_base_allgather.c      | 210 ++++++++++++++++++
 ompi/mca/coll/base/coll_base_allreduce.c      |  70 ++++++
 ompi/mca/coll/base/coll_base_functions.h      |   3 +
 ompi/mca/coll/base/coll_base_reduce.c         | 147 ++++++++++++
 .../tuned/coll_tuned_allgather_decision.c     |   7 +-
 .../tuned/coll_tuned_allreduce_decision.c     |   5 +-
 .../coll/tuned/coll_tuned_reduce_decision.c   |   6 +-
 7 files changed, 445 insertions(+), 3 deletions(-)

diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
index fc28696599c..f9156c96127 100644
--- a/ompi/mca/coll/base/coll_base_allgather.c
+++ b/ompi/mca/coll/base/coll_base_allgather.c
@@ -866,4 +866,214 @@ ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount,
     return err;
 }
 
+/*
+ * ompi_coll_base_allgather_intra_k_bruck
+ *
+ * Function:     allgather using O(log_k(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgather
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  This method extends ompi_coll_base_allgather_intra_bruck to handle any
+ *               radix k; it uses non-blocking communication to take advantage of
+ *               multiple ports.  The algorithm is described in detail in Bruck et al.
+ *               (1997), "Efficient Algorithms for All-to-all Communications in
+ *               Multiport Message-Passing Systems".
+ * Memory requirements:
+ *               Non-zero ranks require a temporary buffer to perform the final
+ *               step of the algorithm.
+ *
+ * Example on 10 nodes with k=3:
+ *   Initialization: every rank has its own block at location 0 in rbuf.
+ *                   This means that if the user specified MPI_IN_PLACE for sendbuf
+ *                   we must copy our block from recvbuf to the beginning!
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *   Step 0: send message to (rank - k^0 * i), receive message from (rank + k^0 * i);
+ *           message size is k^0 * block size and i is between [1, k-1]
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *   Step 1: send message to (rank - k^1 * i), receive message from (rank + k^1 * i);
+ *           message size is k^1 * block size
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *       [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]
+ *       [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]
+ *       [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]
+ *       [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]
+ *       [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]
+ *       [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]
+ *   Step 2: send message to (rank - k^2 * i), receive message from (rank + k^2 * i);
+ *           message size is k^2 * block size or "all remaining blocks" for each exchange
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *       [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]
+ *       [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]
+ *       [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]
+ *       [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]
+ *       [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]
+ *       [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]
+ *       [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]
+ *   Finalization: do a local shift (except on rank 0) to get the data into the correct place
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]
+ *       [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]
+ *       [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]
+ *       [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]
+ *       [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]
+ *       [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]
+ *       [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]
+ *       [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]
+ *       [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]
+ *       [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]
+ */
+int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, int scount,
+                                           struct ompi_datatype_t *sdtype,
+                                           void* rbuf, int rcount,
+                                           struct ompi_datatype_t *rdtype,
+                                           struct ompi_communicator_t *comm,
+                                           mca_coll_base_module_t *module)
+{
+    int line = -1, rank, size, dst, src, err = MPI_SUCCESS;
+    int recvcount, distance;
+    int k = 8;
+    ptrdiff_t rlb, rextent;
+    ptrdiff_t rsize, rgap = 0;
+    ompi_request_t **reqs;
+    int num_reqs, max_reqs = 0;
+
+    char *tmpsend = NULL;
+    char *tmprecv = NULL;
+    char *tmp_buf = NULL;
+    char *tmp_buf_start = NULL;
+
+    rank = ompi_comm_rank(comm);
+    size = ompi_comm_size(comm);
+
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+                 "coll:base:allgather_intra_k_bruck radix %d rank %d", k, rank));
+    err = ompi_datatype_get_extent (rdtype, &rlb, &rextent);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    if (0 != rank) {
+        /* Compute the temporary buffer size, including datatypes empty gaps */
+        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * (size - rank), &rgap);
+        tmp_buf = (char *) malloc(rsize);
+        tmp_buf_start = tmp_buf - rgap;
+    }
+
+    // tmprecv points to the data initially on this rank, handle the MPI_IN_PLACE case
+    tmprecv = (char*) rbuf;
+    if (MPI_IN_PLACE != sbuf) {
+        tmpsend = (char*) sbuf;
+        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    } else if (0 != rank) {
+        // for MPI_IN_PLACE, rank 0's data is already at the correct position
+        tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    /*
+      Maximum number of communication phases: log_k(n).
+      In each phase, rank r:
+      - increases the distance and recvcount by a factor of k
+      - sends (k - 1) messages, each starting at the beginning of rbuf with size
+        (recvcount), to ranks (r - distance * j)
+      - receives (k - 1) messages of size recvcount from ranks (r + distance * j)
+        at location (rbuf + distance * j * rcount * rext)
+      - in the last phase, computes the remaining data for each of the (k - 1)
+        messages to complete all transactions
+    */
+    max_reqs = 2 * (k - 1);
+    reqs = ompi_coll_base_comm_get_reqs(module->base_data, max_reqs);
+    recvcount = 1;
+    tmpsend = (char*) rbuf;
+    for (distance = 1; distance < size; distance *= k) {
+        num_reqs = 0;
+        for (int j = 1; j < k; j++)
+        {
+            if (distance * j >= size) {
+                break;
+            }
+            src = (rank + distance * j) % size;
+            dst = (rank - distance * j + size) % size;
+
+            tmprecv = tmpsend + (ptrdiff_t)distance * j * rcount * rextent;
+
+            if (distance <= (size / k)) {
+                recvcount = distance;
+            } else {
+                recvcount = (distance < (size - distance * j)?
+                             distance:(size - distance * j));
+            }
+
+            err = MCA_PML_CALL(irecv(tmprecv,
+                                     recvcount * rcount,
+                                     rdtype,
+                                     src,
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     comm,
+                                     &reqs[num_reqs++]));
+            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+            err = MCA_PML_CALL(isend(tmpsend,
+                                     recvcount * rcount,
+                                     rdtype,
+                                     dst,
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     MCA_PML_BASE_SEND_STANDARD,
+                                     comm,
+                                     &reqs[num_reqs++]));
+            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        }
+        err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUSES_IGNORE);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    // Finalization step: on all ranks except 0, the data needs to be shifted locally
+    if (0 != rank) {
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  ((ptrdiff_t)(size - rank) * rcount),
+                                                  tmp_buf_start,
+                                                  rbuf);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+        tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  (ptrdiff_t)rank * rcount,
+                                                  rbuf,
+                                                  tmpsend);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+        tmprecv = (char*) rbuf + (ptrdiff_t)rank * rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  (ptrdiff_t)(size - rank) * rcount,
+                                                  tmprecv,
+                                                  tmp_buf_start);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if(tmp_buf != NULL) {
+        free(tmp_buf);
+        tmp_buf = NULL;
+        tmp_buf_start = NULL;
+    }
+
+    return OMPI_SUCCESS;
+
+ err_hndl:
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    if(tmp_buf != NULL) {
+        free(tmp_buf);
+        tmp_buf = NULL;
+        tmp_buf_start = NULL;
+    }
+    (void)line; // silence compiler warning
+    return err;
+}
 /* copied function (with appropriate renaming) ends here */
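The per-phase schedule can be hard to see from the loop alone. The following standalone sketch (illustration only, not part of the patch; the helper name print_k_bruck_schedule is invented) prints, for one rank, the peers and block counts used in each phase of the radix-k Bruck exchange, using the same arithmetic as the irecv/isend loop above:

#include <stdio.h>

/* Print the k-ary Bruck allgather schedule for one rank: in phase `distance`
 * (1, k, k^2, ...) it sends `nblocks` blocks from the front of its
 * partially-gathered buffer to (rank - distance*j) and receives the same
 * amount from (rank + distance*j), for j = 1 .. k-1. */
static void print_k_bruck_schedule(int rank, int size, int k)
{
    for (int distance = 1; distance < size; distance *= k) {
        for (int j = 1; j < k && distance * j < size; j++) {
            int src = (rank + distance * j) % size;
            int dst = (rank - distance * j + size) % size;
            int remaining = size - distance * j;                 /* blocks not yet owned by src */
            int nblocks = (distance < remaining) ? distance : remaining;
            printf("phase %d, j=%d: send %d block(s) to %d, recv %d block(s) from %d\n",
                   distance, j, nblocks, dst, nblocks, src);
        }
    }
}

int main(void)
{
    print_k_bruck_schedule(0, 10, 3);   /* the 10-rank, k = 3 example in the comment above */
    return 0;
}

For rank 0 in the 10-rank, k = 3 example this prints two 1-block exchanges, two 3-block exchanges, and one final 1-block exchange: nine received blocks in total, matching the diagrams in the comment.
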
diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c
index 1eb9d2ca64c..bf375ea9c53 100644
--- a/ompi/mca/coll/base/coll_base_allreduce.c
+++ b/ompi/mca/coll/base/coll_base_allreduce.c
@@ -1376,4 +1376,74 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     return err;
 }
 
+int ompi_coll_base_allreduce_intra_k_bruck(const void *sbuf, void *rbuf, int count,
+                                           struct ompi_datatype_t *dtype,
+                                           struct ompi_op_t *op,
+                                           struct ompi_communicator_t *comm,
+                                           mca_coll_base_module_t *module)
+{
+    int line = -1;
+    char *partial_buf = NULL;
+    char *partial_buf_start = NULL;
+    char *sendtmpbuf = NULL;
+    char *buffer1 = NULL;
+    char *buffer1_start = NULL;
+    int err = OMPI_SUCCESS;
+
+    ptrdiff_t extent, lb;
+    ompi_datatype_get_extent(dtype, &lb, &extent);
+
+    int rank = ompi_comm_rank(comm);
+    int size = ompi_comm_size(comm);
+
+    sendtmpbuf = (char*) sbuf;
+    if( sbuf == MPI_IN_PLACE ) {
+        sendtmpbuf = (char *)rbuf;
+    }
+    ptrdiff_t buf_size, gap = 0;
+    buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap);
+    partial_buf = (char *) malloc(buf_size);
+    partial_buf_start = partial_buf - gap;
+    buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap);
+    buffer1 = (char *) malloc(buf_size);
+    buffer1_start = buffer1 - gap;
+
+    err = ompi_datatype_copy_content_same_ddt(dtype, count,
+                                              (char*)buffer1_start,
+                                              (char*)sendtmpbuf);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* Gather every rank's contribution, then reduce all of them locally */
+    err = comm->c_coll->coll_allgather(buffer1_start, count, dtype,
+                                       partial_buf_start, count, dtype,
+                                       comm, comm->c_coll->coll_allgather_module);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    for(int target = 1; target < size; target++)
+    {
+        ompi_op_reduce(op,
+                       partial_buf_start + (ptrdiff_t)target * count * extent,
+                       partial_buf_start,
+                       count,
+                       dtype);
+    }
+
+    // move data to rbuf
+    err = ompi_datatype_copy_content_same_ddt(dtype, count,
+                                              (char*)rbuf,
+                                              (char*)partial_buf_start);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+err_hndl:
+    /* cleanup is shared by the success and error paths; free(NULL) is a no-op */
+    free(partial_buf);
+    partial_buf = NULL; partial_buf_start = NULL;
+    free(buffer1);
+    buffer1 = NULL; buffer1_start = NULL;
+    if (MPI_SUCCESS != err) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+        "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); }
+    (void)line; // silence compiler warning
+    return err;
+
+}
 /* copied function (with appropriate renaming) ends here */
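Structurally, the new allreduce is an allgather of every rank's input followed by a local reduction of all gathered blocks. A minimal sketch of that pattern using only public MPI calls (a sum of doubles; buffer names and counts are invented for illustration, and this is not the patch code itself):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    enum { COUNT = 4 };
    double mine[COUNT], sum[COUNT];
    double *all = malloc((size_t)size * COUNT * sizeof(double));
    for (int i = 0; i < COUNT; i++) mine[i] = rank + i;

    /* step 1: every rank ends up with every rank's contribution */
    MPI_Allgather(mine, COUNT, MPI_DOUBLE, all, COUNT, MPI_DOUBLE, MPI_COMM_WORLD);

    /* step 2: local reduction over the gathered blocks (a plain sum here,
     * ompi_op_reduce(op, ...) in the patch) */
    for (int i = 0; i < COUNT; i++) sum[i] = all[i];
    for (int r = 1; r < size; r++)
        for (int i = 0; i < COUNT; i++)
            sum[i] += all[(size_t)r * COUNT + i];

    if (0 == rank) printf("sum[0] = %g\n", sum[0]);
    free(all);
    MPI_Finalize();
    return 0;
}

The trade-off is O(size * count) temporary memory and redundant computation on every rank in exchange for a single allgather round, which is presumably why the patch pairs this scheme with the radix-k Bruck allgather for small messages.
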
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index 1c73d01d37e..e75a7c6c21f 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -194,6 +194,7 @@ int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
+int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS);
 
 /* All GatherV */
 int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
@@ -211,6 +212,7 @@ int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsi
 int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
 int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS);
 int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS);
+int ompi_coll_base_allreduce_intra_k_bruck(ALLREDUCE_ARGS);
 
 /* AlltoAll */
 int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
@@ -274,6 +276,7 @@ int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_ou
 int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
+int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 
 /* Reduce_scatter */
 int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c
index e7cf7e0656a..d70b470ff53 100644
--- a/ompi/mca/coll/base/coll_base_reduce.c
+++ b/ompi/mca/coll/base/coll_base_reduce.c
@@ -1142,3 +1142,150 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     free(scount);
     return err;
 }
+
+int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
+                                         int count, ompi_datatype_t* datatype,
+                                         ompi_op_t* op, int root,
+                                         ompi_communicator_t* comm,
+                                         mca_coll_base_module_t *module,
+                                         uint32_t segsize,
+                                         int max_outstanding_reqs )
+{
+    int err = OMPI_SUCCESS, rank, size, line = -1, radix;
+    ptrdiff_t extent, lb;
+    size_t dtype_size;
+    int seg_count = count;
+    char *child_buf = NULL;
+    char *child_buf_start = NULL;
+    char *reduce_buf = NULL;
+    char *reduce_buf_start = NULL;
+    char *sendtmpbuf = NULL;
+    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
+    mca_coll_base_comm_t *data = base_module->base_data;
+    ompi_coll_tree_t* tree;
+    int num_children;
+    bool is_leaf;
+    ptrdiff_t buf_size, gap = 0;
+    int max_reqs, num_reqs;
+    ompi_request_t **reqs = NULL;
+
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d",
+                 count, max_outstanding_reqs));
+
+    rank = ompi_comm_rank(comm);
+    size = ompi_comm_size(comm);
+
+    // create a k-nomial tree with radix 4
+    radix = 4;
+    COLL_BASE_UPDATE_KMTREE(comm, base_module, root, radix);
+    if (NULL == data->cached_kmtree) {
+        // failed to create the k-nomial tree; return an error instead of silently skipping the reduce
+        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+                     "REDUCE: failed to create knomial tree. \n"));
+        err = OMPI_ERR_NOT_FOUND; line = __LINE__; goto err_hndl;
+    }
+
+    tree = data->cached_kmtree;
+    num_children = tree->tree_nextsize;
+    is_leaf = (tree->tree_nextsize == 0) ? true : false;
+
+    ompi_datatype_get_extent(datatype, &lb, &extent);
+    ompi_datatype_type_size(datatype, &dtype_size);
+
+    sendtmpbuf = (char*) sendbuf;
+    if( sendbuf == MPI_IN_PLACE ) {
+        sendtmpbuf = (char *)recvbuf;
+    }
+    buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
+    reduce_buf = (char *)malloc(buf_size);
+    reduce_buf_start = reduce_buf - gap;
+    err = ompi_datatype_copy_content_same_ddt(datatype, count,
+                                              (char*)reduce_buf_start,
+                                              (char*)sendtmpbuf);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    // do the transfer in a single transaction instead of segments
+    num_reqs = 0;
+    max_reqs = num_children;
+    if(!is_leaf) {
+        buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
+        child_buf = (char *)malloc(buf_size);
+        child_buf_start = child_buf - gap;
+        reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
+    }
+
+    for (int i = 0; i < num_children; i++) {
+        int child = tree->tree_next[i];
+        err = MCA_PML_CALL(irecv(child_buf_start + (ptrdiff_t)i * count * extent,
+                                 count,
+                                 datatype,
+                                 child,
+                                 MCA_COLL_BASE_TAG_REDUCE,
+                                 comm,
+                                 &reqs[num_reqs++]));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if (num_reqs > 0) {
+        err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUSES_IGNORE);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    for (int i = 0; i < num_children; i++) {
+        ompi_op_reduce(op,
+                       child_buf_start + (ptrdiff_t)i * count * extent,
+                       reduce_buf_start,
+                       count,
+                       datatype);
+    }
+
+    if (rank != root) {
+        err = MCA_PML_CALL(send(reduce_buf_start,
+                                count,
+                                datatype,
+                                tree->tree_prev,
+                                MCA_COLL_BASE_TAG_REDUCE,
+                                MCA_PML_BASE_SEND_STANDARD,
+                                comm));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if (rank == root) {
+        err = ompi_datatype_copy_content_same_ddt(datatype, count,
+                                                  (char*)recvbuf,
+                                                  (char*)reduce_buf_start);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+    free(child_buf);    /* free(NULL) is a no-op */
+    free(reduce_buf);
+    return OMPI_SUCCESS;
+
+ err_hndl:
+    if (NULL != child_buf) {
+        free(child_buf);
+        child_buf = NULL;
+        child_buf_start = NULL;
+    }
+    if (NULL != reduce_buf) {
+        free(reduce_buf);
+        reduce_buf = NULL;
+        reduce_buf_start = NULL;
+    }
+    if( NULL != reqs ) {
+        if (MPI_ERR_IN_STATUS == err) {
+            for( num_reqs = 0; num_reqs < tree->tree_nextsize; num_reqs++ ) {
+                if (MPI_REQUEST_NULL == reqs[num_reqs]) continue;
+                if (MPI_ERR_PENDING == reqs[num_reqs]->req_status.MPI_ERROR) continue;
+                if (reqs[num_reqs]->req_status.MPI_ERROR != MPI_SUCCESS) {
+                    err = reqs[num_reqs]->req_status.MPI_ERROR;
+                    break;
+                }
+            }
+        }
+        ompi_coll_base_free_reqs(reqs, max_reqs);
+    }
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    (void)line; // silence compiler warning
+    return err;
+
+}
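The reduce above asks COLL_BASE_UPDATE_KMTREE for a radix-4 k-nomial tree, posts one full-size irecv per child, reduces the children's contributions locally, and forwards the partial result to tree_prev. As a rough illustration of the tree shape this implies (not the Open MPI tree-construction code; the helper name knomial_neighbors is invented), the parent/children relation for a tree rooted at virtual rank 0 can be read off the base-k digits of the rank:

#include <stdio.h>

/* Parent and children of `vrank` in a k-nomial tree over p ranks, rooted at
 * virtual rank 0 (a non-zero root is handled by shifting ranks).  A rank's
 * parent is found by clearing its least significant non-zero base-k digit;
 * its children live at the digit positions below that one. */
static void knomial_neighbors(int vrank, int p, int k)
{
    int lsb = 1;                       /* weight k^j of the lowest non-zero digit */
    while (vrank != 0 && vrank % (lsb * k) == 0) {
        lsb *= k;
    }
    if (0 == vrank) {
        printf("rank 0 (root), children:");
    } else {
        int digit = (vrank / lsb) % k;
        printf("rank %d, parent %d, children:", vrank, vrank - digit * lsb);
    }
    /* children: add d * w for every lower digit position w and d = 1 .. k-1 */
    for (int w = 1; w < ((0 == vrank) ? p : lsb); w *= k) {
        for (int d = 1; d < k; d++) {
            if (vrank + d * w < p) {
                printf(" %d", vrank + d * w);
            }
        }
    }
    printf("\n");
}

int main(void)
{
    for (int r = 0; r < 10; r++) {
        knomial_neighbors(r, 10, 4);   /* radix 4, as hard-coded in the patch */
    }
    return 0;
}

With 10 ranks and radix 4 this prints root children {1, 2, 3, 4, 8}, rank 4 with children {5, 6, 7}, rank 8 with child {9}, and leaves elsewhere; a larger radix makes the tree shallower (fewer rounds) at the cost of more concurrent children per parent.
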
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
index 1ae5f506ef1..6ae558c699c 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
@@ -41,6 +41,7 @@ static const mca_base_var_enum_value_t allgather_algorithms[] = {
     {5, "neighbor"},
     {6, "two_proc"},
     {7, "sparbit"},
+    {8, "bruck-k-fanout"},
     {0, NULL}
 };
 
@@ -79,7 +80,7 @@ ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allgather_algorithm",
-                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only, 7: sparbit. "
+                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only, 7: sparbit, 8: bruck with radix k. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -168,6 +169,10 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, int scount,
         return ompi_coll_base_allgather_intra_sparbit(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       comm, module);
+    case (8):
+        return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
+                                                      rbuf, rcount, rdtype,
+                                                      comm, module);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
index 3711cdb8eb1..f29c083d373 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
@@ -43,6 +43,7 @@ static const mca_base_var_enum_value_t allreduce_algorithms[] = {
     {5, "segmented_ring"},
     {6, "rabenseifner"},
     {7, "allgather_reduce"},
+    {8, "allreduce_bruck"},
     {0, NULL}
 };
 
@@ -78,7 +79,7 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allreduce_algorithm",
-                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring. "
+                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring, 6 rabenseifner, 7 allgather_reduce, 8 allreduce_bruck. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -149,6 +150,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, int co
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module);
     case (7):
         return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module);
+    case (8):
+        return ompi_coll_base_allreduce_intra_k_bruck(sbuf, rbuf, count, dtype, op, comm, module);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
index 40e500d1c04..0fa57909d50 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
@@ -42,6 +42,7 @@ static const mca_base_var_enum_value_t reduce_algorithms[] = {
     {5, "binomial"},
     {6, "in-order_binary"},
     {7, "rabenseifner"},
+    {8, "knomial"},
     {0, NULL}
 };
 
@@ -80,7 +81,7 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "reduce_algorithm",
-                                        "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner. "
+                                        "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner, 8 knomial. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -177,6 +178,9 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, int count
                                                           segsize, max_requests);
     case (7): return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype, op, root, comm, module);
+    case (8): return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
+                                                         op, root, comm, module,
+                                                         segsize, max_requests);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));