diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 8132f3fadd3..cb82b47e6be 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -136,6 +136,7 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_ /* Bcast */ int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS); +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize); int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 1c445c0c889..ebf4c409422 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -651,6 +651,78 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count, alg, 0, 0); } + +/* + * bcast_intra_disjoint_dec_fixed: fixed bcast decision for disjoint (inter node) communicators + * + * Function: - selects the broadcast algorithm from the communicator size and the total message size (datatype size * count) + * Accepts: - same arguments as MPI_Bcast(), plus the collective module + * Returns: - MPI_SUCCESS or error code (passed from the bcast implementation) + */ +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { + size_t total_dsize, dsize; + int communicator_size, alg; + communicator_size = ompi_comm_size(comm); + + ompi_datatype_type_size(datatype, &dsize); + total_dsize = dsize * (unsigned long)count; /* datatype size times element count */ + + OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_disjoint_dec_fixed" + " root %d rank %d com_size %d", + root, ompi_comm_rank(comm), communicator_size)); + + /** Algorithm ids (passed as the algorithm argument of ompi_coll_tuned_bcast_intra_do_this(); only 1, 5 and 7 are selected below): + * {1, "basic_linear"}, + * {2, "chain"}, + * {3, "pipeline"}, + * {4, "split_binary_tree"}, + * {5, "binary_tree"}, 
+ * {6, "binomial"}, + * {7, "knomial"}, + * {8, "scatter_allgather"}, + * {9, "scatter_allgather_ring"}, + */ + if (communicator_size < 4) { + alg = 1; + } else if (communicator_size < 8) { + if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 16) { /* same choices as the < 8 branch */ + if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 32) { + if (total_dsize < 262144) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } else { + if (total_dsize < 65536) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } + + return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root, + comm, module, + alg, 0, 0); /* faninout = 0, segsize = 0 */ +} + + /* * reduce_intra_dec * diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index f1be43440cc..1b03ee90acd 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -93,7 +93,14 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) /* By default stick with the fixed version of the tuned collectives. Later on, * when the module get enabled, set the correct version based on the availability * of the dynamic rules. + * For some collectives, we distinguish between disjoint communicators to make + * decisions specific to inter node communication. 
*/ + if (OMPI_COMM_IS_DISJOINT(comm)) { + tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_disjoint_dec_fixed; + } else { + tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed; + } tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_fixed; tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_fixed; tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_fixed; @@ -101,7 +108,6 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_fixed; tuned_module->super.coll_alltoallw = NULL; tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_fixed; - tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_fixed; tuned_module->super.coll_exscan = NULL; tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_fixed; tuned_module->super.coll_gatherv = NULL;