From 99f0482108dd9629920e6dbc406b6fd45497a832 Mon Sep 17 00:00:00 2001
From: Jun Tang
Date: Sun, 11 Feb 2024 18:12:50 +0000
Subject: [PATCH] add Bruck's method to allgather and allreduce; add k-nomial to reduce

Signed-off-by: Jun Tang

---
 ompi/mca/coll/base/coll_base_allgather.c      | 210 ++++++++++++++++++
 ompi/mca/coll/base/coll_base_allreduce.c      |  70 ++++++
 ompi/mca/coll/base/coll_base_functions.h      |   3 +
 ompi/mca/coll/base/coll_base_reduce.c         | 147 ++++++++++++
 .../tuned/coll_tuned_allgather_decision.c     |   7 +-
 .../tuned/coll_tuned_allreduce_decision.c     |   5 +-
 .../coll/tuned/coll_tuned_reduce_decision.c   |   6 +-
 7 files changed, 445 insertions(+), 3 deletions(-)

diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
index fc28696599c..f9156c96127 100644
--- a/ompi/mca/coll/base/coll_base_allgather.c
+++ b/ompi/mca/coll/base/coll_base_allgather.c
@@ -866,4 +866,214 @@ ompi_coll_base_allgather_intra_basic_linear(const void *sbuf, int scount,
     return err;
 }
 
+/*
+ * ompi_coll_base_allgather_intra_k_bruck
+ *
+ * Function:     allgather using O(log_k(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgather
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  This method extends ompi_coll_base_allgather_intra_bruck to handle any
+ *               radix k; it uses non-blocking communication to take advantage of
+ *               multiple ports.  The algorithm is described in detail in Bruck et al.
+ *               (1997), "Efficient Algorithms for All-to-all Communications in
+ *               Multiport Message-Passing Systems".
+ * Memory requirements:
+ *               Non-zero ranks require a temporary buffer to perform the final
+ *               step of the algorithm.
+ *
+ * Example on 10 nodes with k=3:
+ *   Initialization: every rank has its own block at location 0 in rbuf.
+ *                   This means that if the user specified MPI_IN_PLACE for sendbuf
+ *                   we must copy our block from recvbuf to the beginning!
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *   Step 0: send message to (rank - k^0 * i), receive message from (rank + k^0 * i);
+ *           message size is k^0 * block size and i is between [1, k-1]
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *   Step 1: send message to (rank - k^1 * i), receive message from (rank + k^1 * i);
+ *           message size is k^1 * block size
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *       [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]
+ *       [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]
+ *       [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]
+ *       [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]
+ *       [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]
+ *       [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]
+ *   Step 2: send message to (rank - k^2 * i), receive message from (rank + k^2 * i);
+ *           message size is k^2 * block size or "all remaining blocks" for each exchange
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]
+ *       [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]
+ *       [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]
+ *       [3]  [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]
+ *       [4]  [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]
+ *       [5]  [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]
+ *       [6]  [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]
+ *       [7]  [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]
+ *       [8]  [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]
+ *       [9]  [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]
+ *   Finalization: do a local shift (except on rank 0) to get the data into the correct place
+ *    #    0    1    2    3    4    5    6    7    8    9
+ *       [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]  [0]
+ *       [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]  [1]
+ *       [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]  [2]
+ *       [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]  [3]
+ *       [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]  [4]
+ *       [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]  [5]
+ *       [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]  [6]
+ *       [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]  [7]
+ *       [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]  [8]
+ *       [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]  [9]
+ */
+int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, int scount,
+                                           struct ompi_datatype_t *sdtype,
+                                           void* rbuf, int rcount,
+                                           struct ompi_datatype_t *rdtype,
+                                           struct ompi_communicator_t *comm,
+                                           mca_coll_base_module_t *module)
+{
+    int line = -1, rank, size, dst, src, err = MPI_SUCCESS;
+    int recvcount, distance;
+    int k = 8;
+    ptrdiff_t rlb, rextent;
+    ptrdiff_t rsize, rgap = 0;
+    ompi_request_t **reqs;
+    int num_reqs, max_reqs = 0;
+
+    char *tmpsend = NULL;
+    char *tmprecv = NULL;
+    char *tmp_buf = NULL;
+    char *tmp_buf_start = NULL;
+
+    rank = ompi_comm_rank(comm);
+    size = ompi_comm_size(comm);
+
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+                 "coll:base:allgather_intra_k_bruck radix %d rank %d", k, rank));
+    err = ompi_datatype_get_extent (rdtype, &rlb, &rextent);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    if (0 != rank) {
+        /* Compute the temporary buffer size, including datatypes empty gaps */
+        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * (size - rank), &rgap);
+        tmp_buf = (char *) malloc(rsize);
+        tmp_buf_start = tmp_buf - rgap;
+    }
+
+    // tmprecv points to the data initially on this rank, handle the MPI_IN_PLACE case
+    tmprecv = (char*) rbuf;
+    if (MPI_IN_PLACE != sbuf) {
+        tmpsend = (char*) sbuf;
+        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    } else if (0 != rank) {
+        // for MPI_IN_PLACE, rank 0's data is already at the correct position
+        tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    /*
+      Maximum number of communication phases: log_k(n).
+      In each phase, rank r:
+      - increases the distance and recvcount by a factor of k
+      - sends (k - 1) messages, each starting at the beginning of rbuf with size
+        (recvcount), to ranks (r - distance * j)
+      - receives (k - 1) messages of size recvcount from ranks (r + distance * j)
+        at location (rbuf + distance * j * rcount * rext)
+      - in the last phase, computes the remaining data for each of the (k - 1)
+        messages to complete all transactions
+    */
+    max_reqs = 2 * (k - 1);
+    reqs = ompi_coll_base_comm_get_reqs(module->base_data, max_reqs);
+    recvcount = 1;
+    tmpsend = (char*) rbuf;
+    for (distance = 1; distance < size; distance *= k) {
+        num_reqs = 0;
+        for (int j = 1; j < k; j++)
+        {
+            if (distance * j >= size) {
+                break;
+            }
+            src = (rank + distance * j) % size;
+            dst = (rank - distance * j + size) % size;
+
+            tmprecv = tmpsend + (ptrdiff_t)distance * j * rcount * rextent;
+
+            if (distance <= (size / k)) {
+                recvcount = distance;
+            } else {
+                recvcount = (distance < (size - distance * j)?
+                             distance:(size - distance * j));
+            }
+
+            err = MCA_PML_CALL(irecv(tmprecv,
+                                     recvcount * rcount,
+                                     rdtype,
+                                     src,
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     comm,
+                                     &reqs[num_reqs++]));
+            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+            err = MCA_PML_CALL(isend(tmpsend,
+                                     recvcount * rcount,
+                                     rdtype,
+                                     dst,
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     MCA_PML_BASE_SEND_STANDARD,
+                                     comm,
+                                     &reqs[num_reqs++]));
+            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        }
+        err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUSES_IGNORE);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    // Finalization step: on all ranks except 0, the data needs to be shifted locally
+    if (0 != rank) {
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  ((ptrdiff_t)(size - rank) * rcount),
+                                                  tmp_buf_start,
+                                                  rbuf);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+        tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  (ptrdiff_t)rank * rcount,
+                                                  rbuf,
+                                                  tmpsend);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+        tmprecv = (char*) rbuf + (ptrdiff_t)rank * rcount * rextent;
+        err = ompi_datatype_copy_content_same_ddt(rdtype,
+                                                  (ptrdiff_t)(size - rank) * rcount,
+                                                  tmprecv,
+                                                  tmp_buf_start);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if(tmp_buf != NULL) {
+        free(tmp_buf);
+        tmp_buf = NULL;
+        tmp_buf_start = NULL;
+    }
+
+    return OMPI_SUCCESS;
+
+ err_hndl:
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    if(tmp_buf != NULL) {
+        free(tmp_buf);
+        tmp_buf = NULL;
+        tmp_buf_start = NULL;
+    }
+    (void)line; // silence compiler warning
+    return err;
+}
 /* copied function (with appropriate renaming) ends here */
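The per-phase schedule can be hard to see from the loop alone. The following standalone sketch (illustration only, not part of the patch; the helper name print_k_bruck_schedule is invented) prints, for one rank, the peers and block counts used in each phase of the radix-k Bruck exchange, using the same arithmetic as the irecv/isend loop above:

#include <stdio.h>

/* Print the k-ary Bruck allgather schedule for one rank: in phase `distance`
 * (1, k, k^2, ...) it sends `nblocks` blocks from the front of its
 * partially-gathered buffer to (rank - distance*j) and receives the same
 * amount from (rank + distance*j), for j = 1 .. k-1. */
static void print_k_bruck_schedule(int rank, int size, int k)
{
    for (int distance = 1; distance < size; distance *= k) {
        for (int j = 1; j < k && distance * j < size; j++) {
            int src = (rank + distance * j) % size;
            int dst = (rank - distance * j + size) % size;
            int remaining = size - distance * j;                 /* blocks not yet owned by src */
            int nblocks = (distance < remaining) ? distance : remaining;
            printf("phase %d, j=%d: send %d block(s) to %d, recv %d block(s) from %d\n",
                   distance, j, nblocks, dst, nblocks, src);
        }
    }
}

int main(void)
{
    print_k_bruck_schedule(0, 10, 3);   /* the 10-rank, k = 3 example in the comment above */
    return 0;
}

For rank 0 in the 10-rank, k = 3 example this prints two 1-block exchanges, two 3-block exchanges, and one final 1-block exchange: nine received blocks in total, matching the diagrams in the comment.
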
diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c
index 1eb9d2ca64c..bf375ea9c53 100644
--- a/ompi/mca/coll/base/coll_base_allreduce.c
+++ b/ompi/mca/coll/base/coll_base_allreduce.c
@@ -1376,4 +1376,74 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     return err;
 }
 
+int ompi_coll_base_allreduce_intra_k_bruck(const void *sbuf, void *rbuf, int count,
+                                           struct ompi_datatype_t *dtype,
+                                           struct ompi_op_t *op,
+                                           struct ompi_communicator_t *comm,
+                                           mca_coll_base_module_t *module)
+{
+    int line = -1;
+    char *partial_buf = NULL;
+    char *partial_buf_start = NULL;
+    char *sendtmpbuf = NULL;
+    char *buffer1 = NULL;
+    char *buffer1_start = NULL;
+    int err = OMPI_SUCCESS;
+
+    ptrdiff_t extent, lb;
+    ompi_datatype_get_extent(dtype, &lb, &extent);
+
+    int rank = ompi_comm_rank(comm);
+    int size = ompi_comm_size(comm);
+
+    sendtmpbuf = (char*) sbuf;
+    if( sbuf == MPI_IN_PLACE ) {
+        sendtmpbuf = (char *)rbuf;
+    }
+    ptrdiff_t buf_size, gap = 0;
+    buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap);
+    partial_buf = (char *) malloc(buf_size);
+    partial_buf_start = partial_buf - gap;
+    buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap);
+    buffer1 = (char *) malloc(buf_size);
+    buffer1_start = buffer1 - gap;
+
+    err = ompi_datatype_copy_content_same_ddt(dtype, count,
+                                              (char*)buffer1_start,
+                                              (char*)sendtmpbuf);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* Gather every rank's contribution, then reduce all of them locally */
+    err = comm->c_coll->coll_allgather(buffer1_start, count, dtype,
+                                       partial_buf_start, count, dtype,
+                                       comm, comm->c_coll->coll_allgather_module);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    for(int target = 1; target < size; target++)
+    {
+        ompi_op_reduce(op,
+                       partial_buf_start + (ptrdiff_t)target * count * extent,
+                       partial_buf_start,
+                       count,
+                       dtype);
+    }
+
+    // move data to rbuf
+    err = ompi_datatype_copy_content_same_ddt(dtype, count,
+                                              (char*)rbuf,
+                                              (char*)partial_buf_start);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+err_hndl:
+    /* cleanup is shared by the success and error paths; free(NULL) is a no-op */
+    free(partial_buf);
+    partial_buf = NULL; partial_buf_start = NULL;
+    free(buffer1);
+    buffer1 = NULL; buffer1_start = NULL;
+    if (MPI_SUCCESS != err) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+        "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); }
+    (void)line; // silence compiler warning
+    return err;
+
+}
 /* copied function (with appropriate renaming) ends here */
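Structurally, the new allreduce is an allgather of every rank's input followed by a local reduction of all gathered blocks. A minimal sketch of that pattern using only public MPI calls (a sum of doubles; buffer names and counts are invented for illustration, and this is not the patch code itself):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    enum { COUNT = 4 };
    double mine[COUNT], sum[COUNT];
    double *all = malloc((size_t)size * COUNT * sizeof(double));
    for (int i = 0; i < COUNT; i++) mine[i] = rank + i;

    /* step 1: every rank ends up with every rank's contribution */
    MPI_Allgather(mine, COUNT, MPI_DOUBLE, all, COUNT, MPI_DOUBLE, MPI_COMM_WORLD);

    /* step 2: local reduction over the gathered blocks (a plain sum here,
     * ompi_op_reduce(op, ...) in the patch) */
    for (int i = 0; i < COUNT; i++) sum[i] = all[i];
    for (int r = 1; r < size; r++)
        for (int i = 0; i < COUNT; i++)
            sum[i] += all[(size_t)r * COUNT + i];

    if (0 == rank) printf("sum[0] = %g\n", sum[0]);
    free(all);
    MPI_Finalize();
    return 0;
}

The trade-off is O(size * count) temporary memory and redundant computation on every rank in exchange for a single allgather round, which is presumably why the patch pairs this scheme with the radix-k Bruck allgather for small messages.
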
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index 1c73d01d37e..e75a7c6c21f 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -194,6 +194,7 @@ int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
+int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS);
 
 /* All GatherV */
 int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
@@ -211,6 +212,7 @@ int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsi
 int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
 int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS);
 int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS);
+int ompi_coll_base_allreduce_intra_k_bruck(ALLREDUCE_ARGS);
 
 /* AlltoAll */
 int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
@@ -274,6 +276,7 @@ int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_ou
 int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
+int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
 
 /* Reduce_scatter */
 int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c
index e7cf7e0656a..d70b470ff53 100644
--- a/ompi/mca/coll/base/coll_base_reduce.c
+++ b/ompi/mca/coll/base/coll_base_reduce.c
@@ -1142,3 +1142,150 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     free(scount);
     return err;
 }
+
+int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
+                                         int count, ompi_datatype_t* datatype,
+                                         ompi_op_t* op, int root,
+                                         ompi_communicator_t* comm,
+                                         mca_coll_base_module_t *module,
+                                         uint32_t segsize,
+                                         int max_outstanding_reqs )
+{
+    int err = OMPI_SUCCESS, rank, size, line = -1, radix;
+    ptrdiff_t extent, lb;
+    size_t dtype_size;
+    int seg_count = count;
+    char *child_buf = NULL;
+    char *child_buf_start = NULL;
+    char *reduce_buf = NULL;
+    char *reduce_buf_start = NULL;
+    char *sendtmpbuf = NULL;
+    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
+    mca_coll_base_comm_t *data = base_module->base_data;
+    ompi_coll_tree_t* tree;
+    int num_children;
+    bool is_leaf;
+    ptrdiff_t buf_size, gap = 0;
+    int max_reqs, num_reqs;
+    ompi_request_t **reqs = NULL;
+
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d",
+                 count, max_outstanding_reqs));
+
+    rank = ompi_comm_rank(comm);
+    size = ompi_comm_size(comm);
+
+    // create a k-nomial tree with radix 4
+    radix = 4;
+    COLL_BASE_UPDATE_KMTREE(comm, base_module, root, radix);
+    if (NULL == data->cached_kmtree) {
+        // failed to create the k-nomial tree; return an error instead of silently skipping the reduce
+        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+                     "REDUCE: failed to create knomial tree. \n"));
+        err = OMPI_ERR_NOT_FOUND; line = __LINE__; goto err_hndl;
+    }
+
+    tree = data->cached_kmtree;
+    num_children = tree->tree_nextsize;
+    is_leaf = (tree->tree_nextsize == 0) ? true : false;
+
+    ompi_datatype_get_extent(datatype, &lb, &extent);
+    ompi_datatype_type_size(datatype, &dtype_size);
+
+    sendtmpbuf = (char*) sendbuf;
+    if( sendbuf == MPI_IN_PLACE ) {
+        sendtmpbuf = (char *)recvbuf;
+    }
+    buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
+    reduce_buf = (char *)malloc(buf_size);
+    reduce_buf_start = reduce_buf - gap;
+    err = ompi_datatype_copy_content_same_ddt(datatype, count,
+                                              (char*)reduce_buf_start,
+                                              (char*)sendtmpbuf);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    // do the transfer in a single transaction instead of segments
+    num_reqs = 0;
+    max_reqs = num_children;
+    if(!is_leaf) {
+        buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
+        child_buf = (char *)malloc(buf_size);
+        child_buf_start = child_buf - gap;
+        reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
+    }
+
+    for (int i = 0; i < num_children; i++) {
+        int child = tree->tree_next[i];
+        err = MCA_PML_CALL(irecv(child_buf_start + (ptrdiff_t)i * count * extent,
+                                 count,
+                                 datatype,
+                                 child,
+                                 MCA_COLL_BASE_TAG_REDUCE,
+                                 comm,
+                                 &reqs[num_reqs++]));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if (num_reqs > 0) {
+        err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUSES_IGNORE);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    for (int i = 0; i < num_children; i++) {
+        ompi_op_reduce(op,
+                       child_buf_start + (ptrdiff_t)i * count * extent,
+                       reduce_buf_start,
+                       count,
+                       datatype);
+    }
+
+    if (rank != root) {
+        err = MCA_PML_CALL(send(reduce_buf_start,
+                                count,
+                                datatype,
+                                tree->tree_prev,
+                                MCA_COLL_BASE_TAG_REDUCE,
+                                MCA_PML_BASE_SEND_STANDARD,
+                                comm));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    if (rank == root) {
+        err = ompi_datatype_copy_content_same_ddt(datatype, count,
+                                                  (char*)recvbuf,
+                                                  (char*)reduce_buf_start);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+    free(child_buf);    /* free(NULL) is a no-op */
+    free(reduce_buf);
+    return OMPI_SUCCESS;
+
+ err_hndl:
+    if (NULL != child_buf) {
+        free(child_buf);
+        child_buf = NULL;
+        child_buf_start = NULL;
+    }
+    if (NULL != reduce_buf) {
+        free(reduce_buf);
+        reduce_buf = NULL;
+        reduce_buf_start = NULL;
+    }
+    if( NULL != reqs ) {
+        if (MPI_ERR_IN_STATUS == err) {
+            for( num_reqs = 0; num_reqs < tree->tree_nextsize; num_reqs++ ) {
+                if (MPI_REQUEST_NULL == reqs[num_reqs]) continue;
+                if (MPI_ERR_PENDING == reqs[num_reqs]->req_status.MPI_ERROR) continue;
+                if (reqs[num_reqs]->req_status.MPI_ERROR != MPI_SUCCESS) {
+                    err = reqs[num_reqs]->req_status.MPI_ERROR;
+                    break;
+                }
+            }
+        }
+        ompi_coll_base_free_reqs(reqs, max_reqs);
+    }
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    (void)line; // silence compiler warning
+    return err;
+
+}
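The reduce above asks COLL_BASE_UPDATE_KMTREE for a radix-4 k-nomial tree, posts one full-size irecv per child, reduces the children's contributions locally, and forwards the partial result to tree_prev. As a rough illustration of the tree shape this implies (not the Open MPI tree-construction code; the helper name knomial_neighbors is invented), the parent/children relation for a tree rooted at virtual rank 0 can be read off the base-k digits of the rank:

#include <stdio.h>

/* Parent and children of `vrank` in a k-nomial tree over p ranks, rooted at
 * virtual rank 0 (a non-zero root is handled by shifting ranks).  A rank's
 * parent is found by clearing its least significant non-zero base-k digit;
 * its children live at the digit positions below that one. */
static void knomial_neighbors(int vrank, int p, int k)
{
    int lsb = 1;                       /* weight k^j of the lowest non-zero digit */
    while (vrank != 0 && vrank % (lsb * k) == 0) {
        lsb *= k;
    }
    if (0 == vrank) {
        printf("rank 0 (root), children:");
    } else {
        int digit = (vrank / lsb) % k;
        printf("rank %d, parent %d, children:", vrank, vrank - digit * lsb);
    }
    /* children: add d * w for every lower digit position w and d = 1 .. k-1 */
    for (int w = 1; w < ((0 == vrank) ? p : lsb); w *= k) {
        for (int d = 1; d < k; d++) {
            if (vrank + d * w < p) {
                printf(" %d", vrank + d * w);
            }
        }
    }
    printf("\n");
}

int main(void)
{
    for (int r = 0; r < 10; r++) {
        knomial_neighbors(r, 10, 4);   /* radix 4, as hard-coded in the patch */
    }
    return 0;
}

With 10 ranks and radix 4 this prints root children {1, 2, 3, 4, 8}, rank 4 with children {5, 6, 7}, rank 8 with child {9}, and leaves elsewhere; a larger radix makes the tree shallower (fewer rounds) at the cost of more concurrent children per parent.
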
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
index 1ae5f506ef1..6ae558c699c 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
@@ -41,6 +41,7 @@ static const mca_base_var_enum_value_t allgather_algorithms[] = {
     {5, "neighbor"},
     {6, "two_proc"},
     {7, "sparbit"},
+    {8, "bruck-k-fanout"},
     {0, NULL}
 };
 
@@ -79,7 +80,7 @@ ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allgather_algorithm",
-                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only, 7: sparbit. "
+                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only, 7: sparbit, 8: bruck with radix k. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -168,6 +169,10 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, int scount,
         return ompi_coll_base_allgather_intra_sparbit(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       comm, module);
+    case (8):
+        return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
+                                                      rbuf, rcount, rdtype,
+                                                      comm, module);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
index 3711cdb8eb1..f29c083d373 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
@@ -43,6 +43,7 @@ static const mca_base_var_enum_value_t allreduce_algorithms[] = {
     {5, "segmented_ring"},
     {6, "rabenseifner"},
     {7, "allgather_reduce"},
+    {8, "allreduce_bruck"},
     {0, NULL}
 };
 
@@ -78,7 +79,7 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allreduce_algorithm",
-                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring. "
+                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring, 6 rabenseifner, 7 allgather_reduce, 8 allreduce_bruck. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -149,6 +150,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, int co
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module);
     case (7):
         return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module);
+    case (8):
+        return ompi_coll_base_allreduce_intra_k_bruck(sbuf, rbuf, count, dtype, op, comm, module);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
index 40e500d1c04..0fa57909d50 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
@@ -42,6 +42,7 @@ static const mca_base_var_enum_value_t reduce_algorithms[] = {
     {5, "binomial"},
     {6, "in-order_binary"},
     {7, "rabenseifner"},
+    {8, "knomial"},
     {0, NULL}
 };
 
@@ -80,7 +81,7 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "reduce_algorithm",
-                                        "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner. "
+                                        "Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner, 8 knomial. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -177,6 +178,9 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, int count
                                                           segsize, max_requests);
     case (7): return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype, op, root, comm, module);
+    case (8): return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
+                                                         op, root, comm, module,
+                                                         segsize, max_requests);
     }  /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                  algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));