From 7def5606ab6fb669105e6a3226c64486d06dbc27 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Sun, 24 Mar 2024 22:27:09 +0100 Subject: [PATCH 01/29] #2240: Initial work for new allreduce --- .../collective/reduce/allreduce/allreduce.h | 130 ++++++++++++++++++ .../reduce/allreduce/rabenseifner.h | 128 +++++++++++++++++ tests/perf/allreduce.cc | 111 +++++++++++++++ tests/perf/reduce.cc | 16 ++- 4 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 src/vt/collective/reduce/allreduce/allreduce.h create mode 100644 src/vt/collective/reduce/allreduce/rabenseifner.h create mode 100644 tests/perf/allreduce.cc diff --git a/src/vt/collective/reduce/allreduce/allreduce.h b/src/vt/collective/reduce/allreduce/allreduce.h new file mode 100644 index 0000000000..83f8f16472 --- /dev/null +++ b/src/vt/collective/reduce/allreduce/allreduce.h @@ -0,0 +1,130 @@ +/* +//@HEADER +// ***************************************************************************** +// +// reduce.h +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ + +#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_ALLREDUCE_H +#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_ALLREDUCE_H + +#include "vt/config.h" +#include "vt/context/context.h" +#include "vt/messaging/message/message.h" + +#include +#include + +namespace vt::collective::reduce::alleduce { + +template +struct AllreduceMsg + : SerializeIfNeeded, DataT> { + using MessageParentType = + SerializeIfNeeded<::vt::Message, AllreduceMsg, DataT>; + + AllreduceMsg() = default; + AllreduceMsg(AllreduceMsg const&) = default; + AllreduceMsg(AllreduceMsg&&) = default; + + explicit AllreduceMsg(DataT&& in_val) + : MessageParentType(), + val_(std::forward(in_val)) { } + explicit AllreduceMsg(DataT const& in_val) + : MessageParentType(), + val_(in_val) { } + + template + void serialize(SerializeT& s) { + MessageParentType::serialize(s); + s | val_; + } + + DataT val_ = {}; +}; + +template +struct Allreduce { + void rightHalf(AllreduceMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + val_[(val_.size() / 2) + i] += msg->vec_[i]; + } + } + + void rightHalfComplete(AllreduceMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + val_[(val_.size() / 2) + i] = msg->vec_[i]; + } + } + + void leftHalf(AllreduceMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + val_[i] += msg->vec_[i]; + } + } + + void leftHalfComplete(AllreduceMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + val_[i] = msg->vec_[i]; + } + } + + void sendHandler(AllreduceMsg* msg) { + uint32_t start = is_even_ ? 0 : val_.size() / 2; + uint32_t end = is_even_ ? val_.size() / 2 : val_.size(); + for (int i = 0; start < end; start++) { + val_[start] += msg->vec_[i++]; + } + } + + void reducedHan(AllreduceMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + val_[val_.size() / 2 + i] = msg->vec_[i]; + } + } + + Allreduce() { is_even_ = theContext()->getNode() % 2 == 0; } + + bool is_even_ = false; + DataT val_ = {}; +}; + +} // namespace vt::collective::reduce::alleduce + +#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_REDUCE_H*/ diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h new file mode 100644 index 0000000000..f15c522a80 --- /dev/null +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -0,0 +1,128 @@ + + +#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H +#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H + +#include "vt/messaging/message/shared_message.h" +#include "vt/objgroup/manager.h" +#include "vt/collective/reduce/allreduce/allreduce.h" + +#include + +namespace vt::collective::reduce::alleduce { + +template class Op, typename... Args> +void allreduce(Args&&... data) { + + auto msg = vt::makeMessage(std::forward(data)...); + auto const this_node = vt::theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + + using Reducer = Allreduce; + + auto grp_proxy = + vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); + + auto const lastNode = num_nodes - 1; + auto const num_steps = static_cast(log2(num_nodes)); + auto const nprocs_pof2 = 1 << num_steps; + auto const nprocs_rem = num_nodes - nprocs_pof2; + + //////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////// STEP 1 //////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + + int vrt_node; + bool const is_part_of_adjustment_group = this_node < (2 * nprocs_rem); + bool const is_even = this_node % 2 == 0; + vt::runInEpochCollective([=, &vrt_node] { + vt::runInEpochCollective([=] { + if (is_part_of_adjustment_group) { + auto const partner = is_even ? this_node + 1 : this_node - 1; + grp_proxy[partner].send<&Reducer::sendHandler>(std::forward(data...)); + } + }); + + vt::runInEpochCollective([=] { + if (is_part_of_adjustment_group and not is_even) { + auto& vec = grp_proxy[this_node].get()->data_; + grp_proxy[this_node - 1].send<&Reducer::reducedHan>( + std::vector{vec.begin() + (vec.size() / 2), vec.end()}); + } + }); + + if (is_part_of_adjustment_group) { + if (is_even) { + vrt_node = this_node / 2; + } else { + vrt_node = -1; + } + + } else { /* rank >= 2 * nprocs_rem */ + vrt_node = this_node - nprocs_rem; + } + }); + + //////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////// STEP 2 //////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + + // int step = 0; + // auto const wsize = data.size(); + + // auto& vec = grp_proxy[this_node].get()->data_; + + // /* + // Scatter Reduce (distance doubling with vector halving) + // */ + // for (int mask = 1; mask < (1 << num_steps); mask <<= 1) { + // int vdest = vrt_node ^ mask; + // int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + + // vt::runInEpochCollective([=] { + // if (vrt_node != -1) { + // if (this_node < dest) { + // grp_proxy[dest].send<&NodeObj::rightHalf>( + // std::vector{vec.begin() + (vec.size() / 2), vec.end()}); + // } else { + // grp_proxy[dest].send<&NodeObj::leftHalf>( + // std::vector{vec.begin(), vec.end() - (vec.size() / 2)}); + // } + // } + // }); + // } + + // step = num_steps - 1; + + // /* + // AllGather (distance halving with vector halving) + // */ + // for (int mask = (1 << num_steps) >> 1; mask > 0; mask >>= 1) { + // int vdest = vrt_node ^ mask; + // /* Translate vdest virtual rank to real rank */ + // int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + // vt::runInEpochCollective([=] { + // if (vrt_node != -1) { + // if (this_node < dest) { + // grp_proxy[dest].send<&NodeObj::leftHalfComplete>( + // std::vector{vec.begin(), vec.end() - (vec.size() / 2)}); + // } else { + // grp_proxy[dest].send<&NodeObj::rightHalfComplete>( + // std::vector{vec.begin() + (vec.size() / 2), vec.end()}); + // } + // } + // }); + // } + + /* + Send to excluded nodes (if needed) + */ + + /* + Local invoke of the handler + */ +} + +} // namespace vt::collective::reduce::alleduce + +#endif // INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H \ No newline at end of file diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc new file mode 100644 index 0000000000..a093fd0a37 --- /dev/null +++ b/tests/perf/allreduce.cc @@ -0,0 +1,111 @@ +/* +//@HEADER +// ***************************************************************************** +// +// reduce.cc +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ +#include "common/test_harness.h" +#include "vt/context/context.h" +#include +#include +#include +#include +#include + +#include + +using namespace vt; +using namespace vt::tests::perf::common; + +static constexpr int num_iters = 1; + +struct MyTest : PerfTestHarness { }; + +struct NodeObj { + explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } + + void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); + } + struct MyMsg : vt::Message {}; + + void reduceComplete(std::vector in) { + reduce_counter_++; + test_obj_->StopTimer(fmt::format("{} reduce", i)); + test_obj_->GetMemoryUsage(); + if (i < num_iters) { + i++; + auto this_node = theContext()->getNode(); + proxy_[this_node].send(); + } else if (theContext()->getNode() == 0) { + theTerm()->enableTD(); + } + } + + void perfReduce(MyMsg* in_msg) { + test_obj_->StartTimer(fmt::format("{} reduce", i)); + + proxy_.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data_); + } + +private: + MyTest* test_obj_ = nullptr; + vt::objgroup::proxy::Proxy proxy_ = {}; + int reduce_counter_ = -1; + int i = 0; + std::vector data_ = {}; +}; + +VT_PERF_TEST(MyTest, test_reduce) { + auto grp_proxy = vt::theObjGroup()->makeCollective( + "test_reduce", this + ); + + if (theContext()->getNode() == 0) { + theTerm()->disableTD(); + } + + std::vector data(1024, theContext()->getNode()); + grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); + + if (theContext()->getNode() == 0) { + theTerm()->enableTD(); + } +} + +VT_PERF_TEST_MAIN() diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index edda6c8a3d..eb0a3f102f 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -41,16 +41,19 @@ //@HEADER */ #include "common/test_harness.h" +#include "vt/context/context.h" +#include #include #include #include +#include #include INCLUDE_FMT_CORE using namespace vt; using namespace vt::tests::perf::common; -static constexpr int num_iters = 100; +static constexpr int num_iters = 1; struct MyTest : PerfTestHarness { MyTest() { DisableGlobalTimer(); } @@ -59,11 +62,12 @@ struct MyTest : PerfTestHarness { struct NodeObj { explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } - void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); } - + void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); +// data_["Node"] = theContext()->getNode(); } + } struct MyMsg : vt::Message {}; - void reduceComplete() { + void reduceComplete(std::vector in) { reduce_counter_++; test_obj_->StopTimer(fmt::format("{} reduce", i)); test_obj_->GetMemoryUsage(); @@ -78,7 +82,8 @@ struct NodeObj { void perfReduce(MyMsg* in_msg) { test_obj_->StartTimer(fmt::format("{} reduce", i)); - proxy_.allreduce<&NodeObj::reduceComplete>(); + + proxy_.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data_); } private: @@ -86,6 +91,7 @@ struct NodeObj { vt::objgroup::proxy::Proxy proxy_ = {}; int reduce_counter_ = -1; int i = 0; + std::vector data_ = {}; }; VT_PERF_TEST(MyTest, test_reduce) { From 01b6afb2c427c90d922387d10cb5563893c4f5b2 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Thu, 28 Mar 2024 00:08:03 +0100 Subject: [PATCH 02/29] #2240: Semi working Rabenseifner --- .../collective/reduce/allreduce/allreduce.h | 230 ++++++++++-- .../reduce/allreduce/rabenseifner.h | 12 +- src/vt/objgroup/manager.h | 3 + src/vt/objgroup/manager.impl.h | 29 ++ src/vt/objgroup/proxy/proxy_objgroup.h | 9 + src/vt/objgroup/proxy/proxy_objgroup.impl.h | 18 + tests/perf/reduce.cc | 85 +++-- tests/perf/send_cost.cc | 351 ++++++++++++++++++ 8 files changed, 679 insertions(+), 58 deletions(-) diff --git a/src/vt/collective/reduce/allreduce/allreduce.h b/src/vt/collective/reduce/allreduce/allreduce.h index 83f8f16472..db92fc890f 100644 --- a/src/vt/collective/reduce/allreduce/allreduce.h +++ b/src/vt/collective/reduce/allreduce/allreduce.h @@ -47,11 +47,12 @@ #include "vt/config.h" #include "vt/context/context.h" #include "vt/messaging/message/message.h" +#include "vt/objgroup/proxy/proxy_objgroup.h" #include #include -namespace vt::collective::reduce::alleduce { +namespace vt::collective::reduce::allreduce { template struct AllreduceMsg @@ -66,65 +67,244 @@ struct AllreduceMsg explicit AllreduceMsg(DataT&& in_val) : MessageParentType(), val_(std::forward(in_val)) { } - explicit AllreduceMsg(DataT const& in_val) + explicit AllreduceMsg(DataT const& in_val, int step = 0) : MessageParentType(), - val_(in_val) { } + val_(in_val), + step_(step) { } template void serialize(SerializeT& s) { MessageParentType::serialize(s); s | val_; + s | step_; } DataT val_ = {}; + int32_t step_ = {}; }; template struct Allreduce { - void rightHalf(AllreduceMsg* msg) { - for (int i = 0; i < msg->vec_.size(); i++) { - val_[(val_.size() / 2) + i] += msg->vec_[i]; + void initialize( + const DataT& data, vt::objgroup::proxy::Proxy proxy, + uint32_t num_nodes) { + this_node_ = vt::theContext()->getNode(); + is_even_ = this_node_ % 2 == 0; + val_ = data; + proxy_ = proxy; + num_steps_ = static_cast(log2(num_nodes)); + nprocs_pof2_ = 1 << num_steps_; + nprocs_rem_ = num_nodes - nprocs_pof2_; + is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); + if (is_part_of_adjustment_group_) { + if (is_even_) { + vrt_node_ = this_node_ / 2; + } else { + vrt_node_ = -1; + } + } else { + vrt_node_ = this_node_ - nprocs_rem_; + } + + r_index_.resize(num_steps_, 0); + r_count_.resize(num_steps_, 0); + s_index_.resize(num_steps_, 0); + s_count_.resize(num_steps_, 0); + + w_size_ = data.size(); + + int step = 0; + size_t wsize = data.size(); + for (int mask = 1; mask < nprocs_pof2_; mask <<= 1) { + auto vdest = vrt_node_ ^ mask; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + + if (this_node_ < dest) { + r_count_[step] = wsize / 2; + s_count_[step] = wsize - r_count_[step]; + s_index_[step] = r_index_[step] + r_count_[step]; + } else { + s_count_[step] = wsize / 2; + r_count_[step] = wsize - s_count_[step]; + r_index_[step] = s_index_[step] + s_count_[step]; + } + + if (step + 1 < num_steps_) { + r_index_[step + 1] = r_index_[step]; + s_index_[step + 1] = r_index_[step]; + wsize = r_count_[step]; + step++; + } + } + + // std::string str(1024, 0x0); + // for (int i = 0; i < num_steps_; ++i) { + // str.append(fmt::format( + // "Step{}: send_idx = {} send_count = {} recieve_idx = {} recieve_count " + // "= {}\n", + // i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); + // } + // fmt::print( + // "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, w_size_, + // num_steps_, str); + } + + void partOneCollective() { + if (is_part_of_adjustment_group_) { + auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1; + + if (is_even_) { + proxy_[partner].template send<&Allreduce::partOneRightHalf>( + std::vector{val_.begin() + (val_.size() / 2), val_.end()}); + vrt_node_ = this_node_ / 2; + } else { + proxy_[partner].template send<&Allreduce::partOneLeftHalf>( + std::vector{val_.begin(), val_.end() - (val_.size() / 2)}); + vrt_node_ = -1; + } + } else { + vrt_node_ = this_node_ - nprocs_rem_; + } + + if (nprocs_rem_ == 0) { + partTwo(); } } - void rightHalfComplete(AllreduceMsg* msg) { - for (int i = 0; i < msg->vec_.size(); i++) { - val_[(val_.size() / 2) + i] = msg->vec_[i]; + void partOneRightHalf(AllreduceMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[(val_.size() / 2) + i] += msg->val_[i]; } + + // Send to left node + proxy_[theContext()->getNode() - 1] + .template send<&Allreduce::partOneFinalPart>( + std::vector{val_.begin() + (val_.size() / 2), val_.end()}); } - void leftHalf(AllreduceMsg* msg) { - for (int i = 0; i < msg->vec_.size(); i++) { - val_[i] += msg->vec_[i]; + void partOneLeftHalf(AllreduceMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[i] += msg->val_[i]; } } - void leftHalfComplete(AllreduceMsg* msg) { - for (int i = 0; i < msg->vec_.size(); i++) { - val_[i] = msg->vec_[i]; + void partOneFinalPart(AllreduceMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[(val_.size() / 2) + i] = msg->val_[i]; } + + partTwo(); } - void sendHandler(AllreduceMsg* msg) { - uint32_t start = is_even_ ? 0 : val_.size() / 2; - uint32_t end = is_even_ ? val_.size() / 2 : val_.size(); - for (int i = 0; start < end; start++) { - val_[start] += msg->vec_[i++]; + void partTwo() { + auto vdest = vrt_node_ ^ mask_; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + + // fmt::print( + // "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and count " + // "{} \n", + // this_node_, step_, dest, s_index_[step_], s_count_[step_]); + proxy_[dest].template send<&Allreduce::partTwoHandler>( + std::vector{ + val_.begin() + (s_index_[step_]), + val_.begin() + (s_index_[step_]) + s_count_[step_]}, + step_); + + mask_ <<= 1; + if (step_ + 1 < num_steps_) { + step_++; } } - void reducedHan(AllreduceMsg* msg) { - for (int i = 0; i < msg->vec_.size(); i++) { - val_[val_.size() / 2 + i] = msg->vec_[i]; + void partTwoHandler(AllreduceMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[r_index_[msg->step_] + i] += msg->val_[i]; + } + + // std::string data(128, 0x0); + // for (auto val : msg->val_) { + // data.append(fmt::format("{} ", val)); + // } + // fmt::print( + // "[{}] Part2 Step {}: Received data ({}) idx = {} from {}\n", this_node_, + // msg->step_, data, r_index_[msg->step_], + // theContext()->getFromNodeCurrentTask()); + + if (mask_ < nprocs_pof2_) { + partTwo(); + } else { + step_ = num_steps_ - 1; + mask_ = nprocs_pof2_ >> 1; + partThree(); } } - Allreduce() { is_even_ = theContext()->getNode() % 2 == 0; } + void partThree() { + auto vdest = vrt_node_ ^ mask_; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + + // std::string data(128, 0x0); + // auto subV = std::vector{ + // val_.begin() + (r_index_[step_]), + // val_.begin() + (r_index_[step_]) + r_count_[step_]}; + // for (auto val : subV) { + // data.append(fmt::format("{} ", val)); + // } + + // fmt::print( + // "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and count " + // "{} " + // "data={} \n", + // this_node_, step_, dest, r_index_[step_], r_count_[step_], data); + + proxy_[dest].template send<&Allreduce::partThreeHandler>( + std::vector{ + val_.begin() + (r_index_[step_]), + val_.begin() + (r_index_[step_]) + r_count_[step_]}, + step_); + + mask_ >>= 1; + step_--; + } + + void partThreeHandler(AllreduceMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[s_index_[msg->step_] + i] = msg->val_[i]; + } + + // std::string data(128, 0x0); + // for (auto val : msg->val_) { + // data.append(fmt::format("{} ", val)); + // } + // fmt::print( + // "[{}] Part3 Step {}: Received data ({}) idx = {} from {}\n", this_node_, + // msg->step_, data, s_index_[msg->step_], + // theContext()->getFromNodeCurrentTask()); + if (mask_ > 0) { + partThree(); + } + } + + NodeType this_node_ = {}; bool is_even_ = false; + vt::objgroup::proxy::Proxy proxy_ = {}; DataT val_ = {}; + NodeType vrt_node_ = {}; + bool is_part_of_adjustment_group_ = false; + int32_t num_steps_ = {}; + int32_t nprocs_pof2_ = {}; + int32_t nprocs_rem_ = {}; + int32_t mask_ = 1; + + size_t w_size_ = {}; + int32_t step_ = 0; + std::vector r_index_ = {}; + std::vector r_count_ = {}; + std::vector s_index_ = {}; + std::vector s_count_ = {}; }; -} // namespace vt::collective::reduce::alleduce +} // namespace vt::collective::reduce::allreduce #endif /*INCLUDED_VT_COLLECTIVE_REDUCE_REDUCE_H*/ diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index f15c522a80..bc5275352f 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -9,11 +9,10 @@ #include -namespace vt::collective::reduce::alleduce { +namespace vt::collective::reduce::allreduce { template class Op, typename... Args> -void allreduce(Args&&... data) { - +void allreduce_r(Args&&... data) { auto msg = vt::makeMessage(std::forward(data)...); auto const this_node = vt::theContext()->getNode(); auto const num_nodes = theContext()->getNumNodes(); @@ -39,7 +38,8 @@ void allreduce(Args&&... data) { vt::runInEpochCollective([=] { if (is_part_of_adjustment_group) { auto const partner = is_even ? this_node + 1 : this_node - 1; - grp_proxy[partner].send<&Reducer::sendHandler>(std::forward(data...)); + grp_proxy[partner].send<&Reducer::sendHandler>( + std::forward(data...)); } }); @@ -123,6 +123,6 @@ void allreduce(Args&&... data) { */ } -} // namespace vt::collective::reduce::alleduce +} // namespace vt::collective::reduce::allreduce -#endif // INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H \ No newline at end of file +#endif // INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H diff --git a/src/vt/objgroup/manager.h b/src/vt/objgroup/manager.h index 8d144dc14c..5c29647dc0 100644 --- a/src/vt/objgroup/manager.h +++ b/src/vt/objgroup/manager.h @@ -291,6 +291,9 @@ struct ObjGroupManager : runtime::component::Component { ProxyType proxy, std::string const& name, std::string const& parent = "" ); +template class Op, typename DataT> +ObjGroupManager::PendingSendType allreduce_r(ProxyType proxy, const DataT& data); + /** * \brief Perform a reduction over an objgroup * diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index ff24ee549c..3db771007c 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -41,6 +41,8 @@ //@HEADER */ +#include "vt/messaging/message/smart_ptr.h" +#include #if !defined INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H #define INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H @@ -57,6 +59,7 @@ #include "vt/collective/collective_alg.h" #include "vt/messaging/active.h" #include "vt/elm/elm_id_bits.h" +#include "vt/collective/reduce/allreduce/allreduce.h" #include @@ -262,6 +265,32 @@ ObjGroupManager::PendingSendType ObjGroupManager::broadcast(MsgSharedPtr m return objgroup::broadcast(msg,han); } +template < + auto f, typename ObjT, template class Op, typename DataT> +ObjGroupManager::PendingSendType +ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { + // check payload size and choose appropriate algorithm + + auto const this_node = vt::theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + + using Reducer = collective::reduce::allreduce::Allreduce; + + auto grp_proxy = + vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); + + grp_proxy[this_node].template invoke<&Reducer::initialize>( + data, grp_proxy, num_nodes); + + vt::runInEpochCollective([=] { + grp_proxy[this_node].template invoke<&Reducer::partOneCollective>(); + }); + + proxy[this_node].template invoke(grp_proxy.get()->val_); + + return PendingSendType{nullptr}; +} + template *f> ObjGroupManager::PendingSendType ObjGroupManager::reduce( ProxyType proxy, MsgSharedPtr msg, diff --git a/src/vt/objgroup/proxy/proxy_objgroup.h b/src/vt/objgroup/proxy/proxy_objgroup.h index a38cb984f2..1854888009 100644 --- a/src/vt/objgroup/proxy/proxy_objgroup.h +++ b/src/vt/objgroup/proxy/proxy_objgroup.h @@ -198,6 +198,15 @@ struct Proxy { Args&&... args ) const; + template < + auto f, + template class Op = collective::NoneOp, + typename... Args + > + PendingSendType allreduce_h( + Args&&... args + ) const; + /** * \brief Reduce back to a point target. Performs a reduction using operator * `Op` followed by a send to `f` with the result. diff --git a/src/vt/objgroup/proxy/proxy_objgroup.impl.h b/src/vt/objgroup/proxy/proxy_objgroup.impl.h index f546b65719..f9c4ba5b06 100644 --- a/src/vt/objgroup/proxy/proxy_objgroup.impl.h +++ b/src/vt/objgroup/proxy/proxy_objgroup.impl.h @@ -203,6 +203,24 @@ Proxy::allreduce( >(proxy, msg.get(), stamp); } +template +template < + auto f, + template class Op, + typename... Args +> +typename Proxy::PendingSendType +Proxy::allreduce_h( + Args&&... args +) const { + auto proxy = Proxy(*this); + return theObjGroup()->allreduce_r< + f, + ObjT, + Op + >(proxy, std::forward(args)...); +} + template template < auto f, diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index eb0a3f102f..62b5d6e4b8 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -41,12 +41,13 @@ //@HEADER */ #include "common/test_harness.h" +#include "vt/collective/collective_alg.h" #include "vt/context/context.h" #include #include #include #include -#include +#include #include INCLUDE_FMT_CORE @@ -62,51 +63,81 @@ struct MyTest : PerfTestHarness { struct NodeObj { explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } - void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); -// data_["Node"] = theContext()->getNode(); } + void initialize() { + proxy_ = vt::theObjGroup()->getProxy(this); + // data_["Node"] = theContext()->getNode(); } } - struct MyMsg : vt::Message {}; + struct MyMsg : vt::Message { }; - void reduceComplete(std::vector in) { - reduce_counter_++; - test_obj_->StopTimer(fmt::format("{} reduce", i)); - test_obj_->GetMemoryUsage(); - if (i < num_iters) { - i++; - auto this_node = theContext()->getNode(); - proxy_[this_node].send(); - } else if (theContext()->getNode() == 0) { - theTerm()->enableTD(); - } + void newReduceComplete(std::vector in) { + // fmt::print( + // "\n[{}]: allreduce_h done! (Size == {}) Results are ...\n", + // theContext()->getNode(), in.size()); + + // for (int node = 0; node < theContext()->getNumNodes(); ++node) { + // if (node == theContext()->getNode()) { + // std::string printer(128, 0x0); + // for (auto val : in) { + // printer.append(fmt::format("{} ", val)); + // } + + // fmt::print("{}\n", printer); + + // theCollective()->barrier(); + // } + // } + + // fmt::print("\n"); } - void perfReduce(MyMsg* in_msg) { - test_obj_->StartTimer(fmt::format("{} reduce", i)); + void reduceComplete(std::vector in) { + // fmt::print( + // "[{}]: allreduce done! Results are ...\n", theContext()->getNode()); + // for (auto val : in) { + // fmt::print("{} ", val); + // } - proxy_.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data_); + // fmt::print("\n"); } private: MyTest* test_obj_ = nullptr; vt::objgroup::proxy::Proxy proxy_ = {}; - int reduce_counter_ = -1; - int i = 0; - std::vector data_ = {}; }; VT_PERF_TEST(MyTest, test_reduce) { - auto grp_proxy = vt::theObjGroup()->makeCollective( - "test_reduce", this - ); + auto grp_proxy = + vt::theObjGroup()->makeCollective("test_allreduce", this); + + if (theContext()->getNode() == 0) { + theTerm()->disableTD(); + } + + vt::runInEpochCollective([=] { + grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); + }); + + if (theContext()->getNode() == 0) { + theTerm()->enableTD(); + } +} + +VT_PERF_TEST(MyTest, test_allreduce) { + auto grp_proxy = + vt::theObjGroup()->makeCollective("test_allreduce", this); if (theContext()->getNode() == 0) { theTerm()->disableTD(); } - grp_proxy[my_node_].invoke<&NodeObj::initialize>(); + vt::runInEpochCollective([=] { + grp_proxy.allreduce_h<&NodeObj::newReduceComplete, collective::PlusOp>( + data); + }); - using MsgType = typename NodeObj::MyMsg; - grp_proxy[my_node_].send(); + if (theContext()->getNode() == 0) { + theTerm()->enableTD(); + } } VT_PERF_TEST_MAIN() diff --git a/tests/perf/send_cost.cc b/tests/perf/send_cost.cc index f15d0bb8db..ec6d2c2eb8 100644 --- a/tests/perf/send_cost.cc +++ b/tests/perf/send_cost.cc @@ -40,6 +40,357 @@ // ***************************************************************************** //@HEADER */ +#include "common/test_harness.h" +#include "vt/collective/collective_alg.h" +#include "vt/configs/error/config_assert.h" +#include "vt/configs/error/hard_error.h" +#include "vt/context/context.h" +#include "vt/messaging/message/shared_message.h" +#include "vt/scheduler/scheduler.h" +#include +#include +#include +#include +#include + +#include + +#include + +using namespace vt; +using namespace vt::tests::perf::common; + +// static constexpr std::array const payloadSizes = { +// 1, 64, 128, 2048, 16384, 524288, 268435456}; + +static constexpr std::array const payloadSizes = {1, 64, 128}; + +vt::EpochType the_epoch = vt::no_epoch; + +struct SendTest : PerfTestHarness { }; + +//////////////////////////////////////// +//////////////// RAW MPI /////////////// +//////////////////////////////////////// + +// VT_PERF_TEST(SendTest, test_send) { +// auto const thisNode = vt::theContext()->getNode(); + +// if (thisNode == 0) { +// vt::theTerm()->disableTD(); +// } + +// auto const lastNode = theContext()->getNumNodes() - 1; + +// auto const prevNode = (thisNode - 1 + num_nodes_) % num_nodes_; +// auto const nextNode = (thisNode + 1) % num_nodes_; +// int data = thisNode; + +// for (auto size : payloadSizes) { +// std::vector dataVec(size, data); +// std::vector recvData(size, data); + +// StartTimer(fmt::format("Payload size {}", size)); + +// MPI_Request request; +// MPI_Irecv( +// &recvData[0], size, MPI_INT, prevNode, 0, MPI_COMM_WORLD, &request); +// MPI_Send(&dataVec[0], size, MPI_INT, nextNode, 0, MPI_COMM_WORLD); + +// MPI_Wait(&request, MPI_STATUS_IGNORE); + +// StopTimer(fmt::format("Payload size {}", size)); +// } + +// if (vt::theContext()->getNode() == 0) { +// vt::theTerm()->enableTD(); +// } +// } + +//////////////////////////////////////// +///////////// OBJECT GROUP ///////////// +//////////////////////////////////////// + +struct NodeObj { + struct PingMsg : Message { + using MessageParentType = vt::Message; + vt_msg_serialize_required(); + std::vector vec_; + + PingMsg() : Message() { } + explicit PingMsg(std::vector data) : Message() { vec_ = data; } + + explicit PingMsg(size_t size) : Message() { + vec_.resize(size, vt::theContext()->getNode() + 1); + } + PingMsg(size_t size, int32_t val) : Message() { vec_.resize(size, val); } + + template + void serialize(SerializerT& s) { + MessageParentType::serialize(s); + s | vec_; + } + }; + + void rightHalf(NodeObj::PingMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + data_[(data_.size() / 2) + i] += msg->vec_[i]; + } + } + + void rightHalfComplete(NodeObj::PingMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + data_[(data_.size() / 2) + i] = msg->vec_[i]; + } + } + + void leftHalf(NodeObj::PingMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + data_[i] += msg->vec_[i]; + } + } + + void leftHalfComplete(NodeObj::PingMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + data_[i] = msg->vec_[i]; + } + } + + void sendHandler(NodeObj::PingMsg* msg) { + uint32_t start = isEven_ ? 0 : data_.size() / 2; + uint32_t end = isEven_ ? data_.size() / 2 : data_.size(); + for (int i = 0; start < end; start++) { + data_[start] += msg->vec_[i++]; + } + } + + void reducedHan(NodeObj::PingMsg* msg) { + for (int i = 0; i < msg->vec_.size(); i++) { + data_[data_.size() / 2 + i] = msg->vec_[i]; + } + } + + explicit NodeObj(SendTest* test_obj) : test_obj_(test_obj) { + data_.resize(268435456, theContext()->getNode() + 1); + isEven_ = theContext()->getNode() % 2 == 0; + } + + void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); } + void printData() { + for (auto v : data_) { + fmt::print("[{}] {}\n", theContext()->getNode(), v); + } + } + + void printDataFinal(std::vector vec) { + for (auto v : vec) { + // fmt::print("[{}] {}\n", theContext()->getNode(), v); + } + handled_ = true; + } + + bool isEven_ = false; + bool handled_ = false; + SendTest* test_obj_ = nullptr; + vt::objgroup::proxy::Proxy proxy_ = {}; + + std::vector data_ = {}; +}; + +static inline int opal_hibit(int value, int start) { + unsigned int mask; + --start; + mask = 1 << start; + + for (; start >= 0; --start, mask >>= 1) { + if (value & mask) { + break; + } + } + + return start; +} + +// VT_PERF_TEST(SendTest, test_allreduce) { +// auto grp_proxy = +// vt::theObjGroup()->makeCollective("test_objgroup_send", this); +// grp_proxy[my_node_].invoke<&NodeObj::initialize>(); + +// if (theContext()->getNode() == 0) { +// theTerm()->disableTD(); +// } + +// vt::runInEpochCollective([=] { +// grp_proxy.allreduce<&NodeObj::printDataFinal, vt::collective::PlusOp>( +// std::vector(268435456, theContext()->getNode() + 1)); +// }); + +// vtAssert(grp_proxy[theContext()->getNode()].get()->handled_, ""); +// if (vt::theContext()->getNode() == 0) { +// vt::theTerm()->enableTD(); +// } +// } + +VT_PERF_TEST(SendTest, test_objgroup_send) { + auto grp_proxy = + vt::theObjGroup()->makeCollective("test_objgroup_send", this); + grp_proxy[my_node_].invoke<&NodeObj::initialize>(); + + if (theContext()->getNode() == 0) { + theTerm()->disableTD(); + } + + auto const thisNode = vt::theContext()->getNode(); + auto const lastNode = theContext()->getNumNodes() - 1; + + int nsteps = 2; + auto nprocs_rem = 0; + size_t count = 32; //1 << 6; + auto* buf = (int32_t*)malloc(sizeof(int32_t) * count); + auto nprocs_pof2 = 1 << nsteps; + auto rank = theContext()->getNode(); + auto vrank = theContext()->getNode(); + int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; + rindex = (int*)malloc(sizeof(*rindex) * nsteps); + sindex = (int*)malloc(sizeof(*sindex) * nsteps); + rcount = (int*)malloc(sizeof(*rcount) * nsteps); + scount = (int*)malloc(sizeof(*scount) * nsteps); + + int step = 0; + auto wsize = count; + sindex[0] = rindex[0] = 0; + + fmt::print( + "[{}] Starting with numNodes = {} dataSize = {} \n", rank, nprocs_pof2, + count); + for (int mask = 1; mask < nprocs_pof2; mask <<= 1) { + /* + * On each iteration: rindex[step] = sindex[step] -- beginning of the + * current window. Length of the current window is storded in wsize. + */ + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + + if (rank < dest) { + /* + * Recv into the left half of the current window, send the right + * half of the window to the peer (perform reduce on the left + * half of the current window) + */ + rcount[step] = wsize / 2; + scount[step] = wsize - rcount[step]; + sindex[step] = rindex[step] + rcount[step]; + } else { + /* + * Recv into the right half of the current window, send the left + * half of the window to the peer (perform reduce on the right + * half of the current window) + */ + scount[step] = wsize / 2; + rcount[step] = wsize - scount[step]; + rindex[step] = sindex[step] + scount[step]; + } + + /* Send part of data from the rbuf, recv into the tmp_buf */ + // err = ompi_coll_base_sendrecv( + // (char*)rbuf + (ptrdiff_t)sindex[step] * extent, scount[step], dtype, dest, + // MCA_COLL_BASE_TAG_ALLREDUCE, + // (char*)tmp_buf + (ptrdiff_t)rindex[step] * extent, rcount[step], dtype, + // dest, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE, rank); + + fmt::print( + "[{}] Sending to rank {} data + offset({}) with count = {}\nPerforming " + "reduce on data starting with offset {} and count {}\n", + rank, dest, sindex[step], scount[step], rindex[step], rcount[step]); + + theCollective()->barrier(); + /* Local reduce: rbuf[] = tmp_buf[] rbuf[] */ + // ompi_op_reduce( + // op, (char*)tmp_buf + (ptrdiff_t)rindex[step] * extent, + // (char*)rbuf + (ptrdiff_t)rindex[step] * extent, rcount[step], dtype); + + /* Move the current window to the received message */ + if (step + 1 < nsteps) { + rindex[step + 1] = rindex[step]; + sindex[step + 1] = rindex[step]; + wsize = rcount[step]; + step++; + } + } + step = nsteps - 1; + + for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) { + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + + /* + * Send rcount[step] elements from rbuf[rindex[step]...] + * Recv scount[step] elements to rbuf[sindex[step]...] + */ + // err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)rindex[step] * extent, + // rcount[step], dtype, dest, + // MCA_COLL_BASE_TAG_ALLREDUCE, + // (char *)rbuf + (ptrdiff_t)sindex[step] * extent, + // scount[step], dtype, dest, + // MCA_COLL_BASE_TAG_ALLREDUCE, comm, + // MPI_STATUS_IGNORE, rank); + fmt::print( + "[{}] Sending to rank {} data + offset({}) with count = {}\nReceiving " + "data starting with offset {} and count {}\n", + rank, dest, rindex[step], rcount[step], sindex[step], scount[step]); + step--; + } + + if (vt::theContext()->getNode() == 0) { + vt::theTerm()->enableTD(); + } +} + +VT_PERF_TEST_MAIN() +/* +//@HEADER +// ***************************************************************************** +// +// send_cost.cc +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ #include "common/test_harness.h" #include "vt/collective/collective_alg.h" From 5289362fa1817a08ab73bd678f9a890c269bb4b6 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Thu, 4 Apr 2024 15:53:21 +0200 Subject: [PATCH 03/29] #2240: Working Rabenseifner (non-commutative ops) --- .../collective/reduce/allreduce/allreduce.h | 218 ++++++++++++------ src/vt/objgroup/manager.impl.h | 10 +- tests/perf/reduce.cc | 7 +- tests/perf/send_cost.cc | 4 +- 4 files changed, 167 insertions(+), 72 deletions(-) diff --git a/src/vt/collective/reduce/allreduce/allreduce.h b/src/vt/collective/reduce/allreduce/allreduce.h index db92fc890f..3dabb10b5d 100644 --- a/src/vt/collective/reduce/allreduce/allreduce.h +++ b/src/vt/collective/reduce/allreduce/allreduce.h @@ -54,6 +54,8 @@ namespace vt::collective::reduce::allreduce { +constexpr bool debug = false; + template struct AllreduceMsg : SerializeIfNeeded, DataT> { @@ -137,37 +139,37 @@ struct Allreduce { } } - // std::string str(1024, 0x0); - // for (int i = 0; i < num_steps_; ++i) { - // str.append(fmt::format( - // "Step{}: send_idx = {} send_count = {} recieve_idx = {} recieve_count " - // "= {}\n", - // i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); - // } - // fmt::print( - // "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, w_size_, - // num_steps_, str); + expected_send_ = num_steps_; + expected_recv_ = num_steps_; + steps_sent_.resize(num_steps_, false); + steps_recv_.resize(num_steps_, false); + + if constexpr (debug) { + std::string str(1024, 0x0); + for (int i = 0; i < num_steps_; ++i) { + str.append(fmt::format( + "Step{}: send_idx = {} send_count = {} recieve_idx = {} " + "recieve_count " + "= {}\n", + i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); + } + fmt::print( + "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, + w_size_, num_steps_, str); + } } - void partOneCollective() { + void partOne() { if (is_part_of_adjustment_group_) { auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1; if (is_even_) { proxy_[partner].template send<&Allreduce::partOneRightHalf>( - std::vector{val_.begin() + (val_.size() / 2), val_.end()}); - vrt_node_ = this_node_ / 2; + DataT{val_.begin() + (val_.size() / 2), val_.end()}); } else { proxy_[partner].template send<&Allreduce::partOneLeftHalf>( - std::vector{val_.begin(), val_.end() - (val_.size() / 2)}); - vrt_node_ = -1; + DataT{val_.begin(), val_.end() - (val_.size() / 2)}); } - } else { - vrt_node_ = this_node_ - nprocs_rem_; - } - - if (nprocs_rem_ == 0) { - partTwo(); } } @@ -179,7 +181,7 @@ struct Allreduce { // Send to left node proxy_[theContext()->getNode() - 1] .template send<&Allreduce::partOneFinalPart>( - std::vector{val_.begin() + (val_.size() / 2), val_.end()}); + DataT{val_.begin() + (val_.size() / 2), val_.end()}); } void partOneLeftHalf(AllreduceMsg* msg) { @@ -193,26 +195,42 @@ struct Allreduce { val_[(val_.size() / 2) + i] = msg->val_[i]; } - partTwo(); + // partTwo(); } void partTwo() { + if ( + vrt_node_ == -1 or (step_ >= num_steps_) or + (not std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; }))) { + return; + } + auto vdest = vrt_node_ ^ mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - - // fmt::print( - // "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and count " - // "{} \n", - // this_node_, step_, dest, s_index_[step_], s_count_[step_]); + if constexpr (debug) { + fmt::print( + "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and " + "count " + "{} \n", + this_node_, step_, dest, s_index_[step_], s_count_[step_]); + } proxy_[dest].template send<&Allreduce::partTwoHandler>( - std::vector{ + DataT{ val_.begin() + (s_index_[step_]), val_.begin() + (s_index_[step_]) + s_count_[step_]}, step_); mask_ <<= 1; - if (step_ + 1 < num_steps_) { - step_++; + num_send_++; + steps_sent_[step_] = true; + step_++; + + if (std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; })) { + partTwo(); } } @@ -220,51 +238,87 @@ struct Allreduce { for (int i = 0; i < msg->val_.size(); i++) { val_[r_index_[msg->step_] + i] += msg->val_[i]; } - - // std::string data(128, 0x0); - // for (auto val : msg->val_) { - // data.append(fmt::format("{} ", val)); - // } - // fmt::print( - // "[{}] Part2 Step {}: Received data ({}) idx = {} from {}\n", this_node_, - // msg->step_, data, r_index_[msg->step_], - // theContext()->getFromNodeCurrentTask()); - + if constexpr (debug) { + std::string data(128, 0x0); + for (auto val : msg->val_) { + data.append(fmt::format("{} ", val)); + } + fmt::print( + "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: Received data ({}) " + "idx = {} from {}\n", + this_node_, msg->step_, mask_, nprocs_pof2_, data, r_index_[msg->step_], + theContext()->getFromNodeCurrentTask()); + } + steps_recv_[msg->step_] = true; + num_recv_++; if (mask_ < nprocs_pof2_) { - partTwo(); + if (std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; })) { + partTwo(); + } } else { - step_ = num_steps_ - 1; - mask_ = nprocs_pof2_ >> 1; - partThree(); + // step_ = num_steps_ - 1; + // mask_ = nprocs_pof2_ >> 1; + // partThree(); } } void partThree() { + if ( + vrt_node_ == -1 or + (not std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; }))) { + return; + } + + if (not startedPartThree_) { + step_ = num_steps_ - 1; + mask_ = nprocs_pof2_ >> 1; + num_send_ = 0; + num_recv_ = 0; + startedPartThree_ = true; + std::fill(steps_sent_.begin(), steps_sent_.end(), false); + std::fill(steps_recv_.begin(), steps_recv_.end(), false); + } + auto vdest = vrt_node_ ^ mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - // std::string data(128, 0x0); - // auto subV = std::vector{ - // val_.begin() + (r_index_[step_]), - // val_.begin() + (r_index_[step_]) + r_count_[step_]}; - // for (auto val : subV) { - // data.append(fmt::format("{} ", val)); - // } - - // fmt::print( - // "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and count " - // "{} " - // "data={} \n", - // this_node_, step_, dest, r_index_[step_], r_count_[step_], data); + if constexpr (debug) { + std::string data(128, 0x0); + auto subV = std::vector{ + val_.begin() + (r_index_[step_]), + val_.begin() + (r_index_[step_]) + r_count_[step_]}; + for (auto val : subV) { + data.append(fmt::format("{} ", val)); + } + fmt::print( + "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and " + "count " + "{} " + "data={} \n", + this_node_, step_, dest, r_index_[step_], r_count_[step_], data); + } proxy_[dest].template send<&Allreduce::partThreeHandler>( - std::vector{ + DataT{ val_.begin() + (r_index_[step_]), val_.begin() + (r_index_[step_]) + r_count_[step_]}, step_); + steps_sent_[step_] = true; + num_send_++; mask_ >>= 1; step_--; + if ( + step_ >= 0 and + std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; })) { + partThree(); + } } void partThreeHandler(AllreduceMsg* msg) { @@ -272,16 +326,36 @@ struct Allreduce { val_[s_index_[msg->step_] + i] = msg->val_[i]; } - // std::string data(128, 0x0); - // for (auto val : msg->val_) { - // data.append(fmt::format("{} ", val)); - // } - // fmt::print( - // "[{}] Part3 Step {}: Received data ({}) idx = {} from {}\n", this_node_, - // msg->step_, data, s_index_[msg->step_], - // theContext()->getFromNodeCurrentTask()); + if (not startedPartThree_) { + step_ = num_steps_ - 1; + mask_ = nprocs_pof2_ >> 1; + num_send_ = 0; + num_recv_ = 0; + startedPartThree_ = true; + std::fill(steps_sent_.begin(), steps_sent_.end(), false); + std::fill(steps_recv_.begin(), steps_recv_.end(), false); + } - if (mask_ > 0) { + num_recv_++; + if constexpr (debug) { + std::string data(128, 0x0); + for (auto val : msg->val_) { + data.append(fmt::format("{} ", val)); + } + fmt::print( + "[{}] Part3 Step {}: Received data ({}) idx = {} from {}\n", this_node_, + msg->step_, data, s_index_[msg->step_], + theContext()->getFromNodeCurrentTask()); + } + + steps_recv_[msg->step_] = true; + + if ( + mask_ > 0 and + ((step_ == num_steps_ - 1) or + std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; }))) { partThree(); } } @@ -296,9 +370,17 @@ struct Allreduce { int32_t nprocs_pof2_ = {}; int32_t nprocs_rem_ = {}; int32_t mask_ = 1; + bool startedPartThree_ = false; size_t w_size_ = {}; int32_t step_ = 0; + int32_t num_send_ = 0; + int32_t expected_send_ = 0; + int32_t num_recv_ = 0; + int32_t expected_recv_ = 0; + + std::vector steps_recv_ = {}; + std::vector steps_sent_ = {}; std::vector r_index_ = {}; std::vector r_count_ = {}; std::vector s_index_ = {}; diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 3db771007c..770005e9ce 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -283,7 +283,15 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { data, grp_proxy, num_nodes); vt::runInEpochCollective([=] { - grp_proxy[this_node].template invoke<&Reducer::partOneCollective>(); + grp_proxy[this_node].template invoke<&Reducer::partOne>(); + }); + + vt::runInEpochCollective([=] { + grp_proxy[this_node].template invoke<&Reducer::partTwo>(); + }); + + vt::runInEpochCollective([=] { + grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); proxy[this_node].template invoke(grp_proxy.get()->val_); diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index 62b5d6e4b8..08d92c148d 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -42,6 +42,7 @@ */ #include "common/test_harness.h" #include "vt/collective/collective_alg.h" +#include "vt/configs/error/config_assert.h" #include "vt/context/context.h" #include #include @@ -73,7 +74,11 @@ struct NodeObj { // fmt::print( // "\n[{}]: allreduce_h done! (Size == {}) Results are ...\n", // theContext()->getNode(), in.size()); - + // const auto p = theContext()->getNumNodes(); + // const auto expected = (p * (p + 1)) / 2; + // for (auto val : in) { + // vtAssert(val == expected, "FAILURE!"); + // } // for (int node = 0; node < theContext()->getNumNodes(); ++node) { // if (node == theContext()->getNode()) { // std::string printer(128, 0x0); diff --git a/tests/perf/send_cost.cc b/tests/perf/send_cost.cc index ec6d2c2eb8..6ce249eb2f 100644 --- a/tests/perf/send_cost.cc +++ b/tests/perf/send_cost.cc @@ -243,9 +243,9 @@ VT_PERF_TEST(SendTest, test_objgroup_send) { auto const thisNode = vt::theContext()->getNode(); auto const lastNode = theContext()->getNumNodes() - 1; - int nsteps = 2; + int nsteps = static_cast(log2(theContext()->getNumNodes())); auto nprocs_rem = 0; - size_t count = 32; //1 << 6; + size_t count = 16; //1 << 6; auto* buf = (int32_t*)malloc(sizeof(int32_t) * count); auto nprocs_pof2 = 1 << nsteps; auto rank = theContext()->getNode(); From d52afebfc11a485eeddf5ca5634d65ccf4e36c25 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Sun, 7 Apr 2024 22:22:37 +0200 Subject: [PATCH 04/29] #2240: Fix non power of 2 for new allreduce --- src/vt/collective/reduce/allreduce/allreduce.h | 12 ++++++++++++ src/vt/objgroup/manager.impl.h | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/src/vt/collective/reduce/allreduce/allreduce.h b/src/vt/collective/reduce/allreduce/allreduce.h index 3dabb10b5d..621d220df0 100644 --- a/src/vt/collective/reduce/allreduce/allreduce.h +++ b/src/vt/collective/reduce/allreduce/allreduce.h @@ -360,6 +360,18 @@ struct Allreduce { } } + void partFour() { + if (is_part_of_adjustment_group_ and is_even_) { + if constexpr (debug) { + fmt::print( + "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); + } + proxy_[this_node_ + 1].template send<&Allreduce::partFourHandler>(val_); + } + } + + void partFourHandler(AllreduceMsg* msg) { val_ = msg->val_; } + NodeType this_node_ = {}; bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 770005e9ce..13f4d80de8 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -274,6 +274,10 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { auto const this_node = vt::theContext()->getNode(); auto const num_nodes = theContext()->getNumNodes(); + if(num_nodes < 2){ + return PendingSendType{nullptr}; + } + using Reducer = collective::reduce::allreduce::Allreduce; auto grp_proxy = @@ -294,6 +298,10 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); + vt::runInEpochCollective([=] { + grp_proxy[this_node].template invoke<&Reducer::partFour>(); + }); + proxy[this_node].template invoke(grp_proxy.get()->val_); return PendingSendType{nullptr}; From 5372da5147d64a4dc293f0c5657b056f4497562f Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Wed, 10 Apr 2024 12:26:53 +0200 Subject: [PATCH 05/29] #2240: Initial work for adding recursive doubling allreduce algorithm --- .../{allreduce.h => distance_doubling.h} | 174 ++----- .../reduce/allreduce/rabenseifner.h | 481 ++++++++++++++---- src/vt/objgroup/manager.impl.h | 23 +- tests/perf/allreduce.cc | 1 - tests/perf/common/test_harness_macros.h | 2 - tests/perf/reduce.cc | 34 +- 6 files changed, 446 insertions(+), 269 deletions(-) rename src/vt/collective/reduce/allreduce/{allreduce.h => distance_doubling.h} (60%) diff --git a/src/vt/collective/reduce/allreduce/allreduce.h b/src/vt/collective/reduce/allreduce/distance_doubling.h similarity index 60% rename from src/vt/collective/reduce/allreduce/allreduce.h rename to src/vt/collective/reduce/allreduce/distance_doubling.h index 621d220df0..a5b508fc0f 100644 --- a/src/vt/collective/reduce/allreduce/allreduce.h +++ b/src/vt/collective/reduce/allreduce/distance_doubling.h @@ -41,8 +41,8 @@ //@HEADER */ -#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_ALLREDUCE_H -#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_ALLREDUCE_H +#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DISTANCE_DOUBLING_H +#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DISTANCE_DOUBLING_H #include "vt/config.h" #include "vt/context/context.h" @@ -54,22 +54,22 @@ namespace vt::collective::reduce::allreduce { -constexpr bool debug = false; +constexpr bool isdebug = false; template -struct AllreduceMsg - : SerializeIfNeeded, DataT> { +struct AllreduceDblMsg + : SerializeIfNeeded, DataT> { using MessageParentType = - SerializeIfNeeded<::vt::Message, AllreduceMsg, DataT>; + SerializeIfNeeded<::vt::Message, AllreduceDblMsg, DataT>; - AllreduceMsg() = default; - AllreduceMsg(AllreduceMsg const&) = default; - AllreduceMsg(AllreduceMsg&&) = default; + AllreduceDblMsg() = default; + AllreduceDblMsg(AllreduceDblMsg const&) = default; + AllreduceDblMsg(AllreduceDblMsg&&) = default; - explicit AllreduceMsg(DataT&& in_val) + explicit AllreduceDblMsg(DataT&& in_val) : MessageParentType(), val_(std::forward(in_val)) { } - explicit AllreduceMsg(DataT const& in_val, int step = 0) + explicit AllreduceDblMsg(DataT const& in_val, int step = 0) : MessageParentType(), val_(in_val), step_(step) { } @@ -86,9 +86,9 @@ struct AllreduceMsg }; template -struct Allreduce { +struct DistanceDoubling { void initialize( - const DataT& data, vt::objgroup::proxy::Proxy proxy, + const DataT& data, vt::objgroup::proxy::Proxy proxy, uint32_t num_nodes) { this_node_ = vt::theContext()->getNode(); is_even_ = this_node_ % 2 == 0; @@ -108,96 +108,27 @@ struct Allreduce { vrt_node_ = this_node_ - nprocs_rem_; } - r_index_.resize(num_steps_, 0); - r_count_.resize(num_steps_, 0); - s_index_.resize(num_steps_, 0); - s_count_.resize(num_steps_, 0); - w_size_ = data.size(); - int step = 0; - size_t wsize = data.size(); - for (int mask = 1; mask < nprocs_pof2_; mask <<= 1) { - auto vdest = vrt_node_ ^ mask; - auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - - if (this_node_ < dest) { - r_count_[step] = wsize / 2; - s_count_[step] = wsize - r_count_[step]; - s_index_[step] = r_index_[step] + r_count_[step]; - } else { - s_count_[step] = wsize / 2; - r_count_[step] = wsize - s_count_[step]; - r_index_[step] = s_index_[step] + s_count_[step]; - } - - if (step + 1 < num_steps_) { - r_index_[step + 1] = r_index_[step]; - s_index_[step + 1] = r_index_[step]; - wsize = r_count_[step]; - step++; - } - } - expected_send_ = num_steps_; expected_recv_ = num_steps_; steps_sent_.resize(num_steps_, false); steps_recv_.resize(num_steps_, false); - - if constexpr (debug) { - std::string str(1024, 0x0); - for (int i = 0; i < num_steps_; ++i) { - str.append(fmt::format( - "Step{}: send_idx = {} send_count = {} recieve_idx = {} " - "recieve_count " - "= {}\n", - i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); - } - fmt::print( - "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, - w_size_, num_steps_, str); - } } void partOne() { - if (is_part_of_adjustment_group_) { - auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1; - - if (is_even_) { - proxy_[partner].template send<&Allreduce::partOneRightHalf>( - DataT{val_.begin() + (val_.size() / 2), val_.end()}); - } else { - proxy_[partner].template send<&Allreduce::partOneLeftHalf>( - DataT{val_.begin(), val_.end() - (val_.size() / 2)}); - } - } - } - - void partOneRightHalf(AllreduceMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[(val_.size() / 2) + i] += msg->val_[i]; + if (is_part_of_adjustment_group_ and is_even_) { + proxy_[this_node_ + 1].template send<&DistanceDoubling::partOneHandler>( + val_); } - - // Send to left node - proxy_[theContext()->getNode() - 1] - .template send<&Allreduce::partOneFinalPart>( - DataT{val_.begin() + (val_.size() / 2), val_.end()}); } - void partOneLeftHalf(AllreduceMsg* msg) { + void partOneHandler(AllreduceDblMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { val_[i] += msg->val_[i]; } } - void partOneFinalPart(AllreduceMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[(val_.size() / 2) + i] = msg->val_[i]; - } - - // partTwo(); - } - void partTwo() { if ( vrt_node_ == -1 or (step_ >= num_steps_) or @@ -209,18 +140,11 @@ struct Allreduce { auto vdest = vrt_node_ ^ mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - if constexpr (debug) { + if constexpr (isdebug) { fmt::print( - "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and " - "count " - "{} \n", - this_node_, step_, dest, s_index_[step_], s_count_[step_]); + "[{}] Part2 Step {}: Sending to Node {} \n", this_node_, step_, dest); } - proxy_[dest].template send<&Allreduce::partTwoHandler>( - DataT{ - val_.begin() + (s_index_[step_]), - val_.begin() + (s_index_[step_]) + s_count_[step_]}, - step_); + proxy_[dest].template send<&DistanceDoubling::partTwoHandler>(val_, step_); mask_ <<= 1; num_send_++; @@ -234,11 +158,11 @@ struct Allreduce { } } - void partTwoHandler(AllreduceMsg* msg) { + void partTwoHandler(AllreduceDblMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { - val_[r_index_[msg->step_] + i] += msg->val_[i]; + val_[i] += msg->val_[i]; } - if constexpr (debug) { + if constexpr (isdebug) { std::string data(128, 0x0); for (auto val : msg->val_) { data.append(fmt::format("{} ", val)); @@ -246,7 +170,7 @@ struct Allreduce { fmt::print( "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: Received data ({}) " "idx = {} from {}\n", - this_node_, msg->step_, mask_, nprocs_pof2_, data, r_index_[msg->step_], + this_node_, msg->step_, mask_, nprocs_pof2_, data, theContext()->getFromNodeCurrentTask()); } steps_recv_[msg->step_] = true; @@ -286,27 +210,19 @@ struct Allreduce { auto vdest = vrt_node_ ^ mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - if constexpr (debug) { - std::string data(128, 0x0); - auto subV = std::vector{ - val_.begin() + (r_index_[step_]), - val_.begin() + (r_index_[step_]) + r_count_[step_]}; - for (auto val : subV) { + if constexpr (isdebug) { + std::string data(1024, 0x0); + + for (auto val : val_) { data.append(fmt::format("{} ", val)); } fmt::print( - "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and " - "count " - "{} " - "data={} \n", - this_node_, step_, dest, r_index_[step_], r_count_[step_], data); + "[{}] Part3 Step {}: Sending to Node {} data={} \n", this_node_, step_, + dest, data); } - proxy_[dest].template send<&Allreduce::partThreeHandler>( - DataT{ - val_.begin() + (r_index_[step_]), - val_.begin() + (r_index_[step_]) + r_count_[step_]}, - step_); + proxy_[dest].template send<&DistanceDoubling::partThreeHandler>( + val_, step_); steps_sent_[step_] = true; num_send_++; @@ -321,9 +237,9 @@ struct Allreduce { } } - void partThreeHandler(AllreduceMsg* msg) { + void partThreeHandler(AllreduceDblMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { - val_[s_index_[msg->step_] + i] = msg->val_[i]; + val_[i] = msg->val_[i]; } if (not startedPartThree_) { @@ -337,15 +253,14 @@ struct Allreduce { } num_recv_++; - if constexpr (debug) { + if constexpr (isdebug) { std::string data(128, 0x0); for (auto val : msg->val_) { data.append(fmt::format("{} ", val)); } fmt::print( - "[{}] Part3 Step {}: Received data ({}) idx = {} from {}\n", this_node_, - msg->step_, data, s_index_[msg->step_], - theContext()->getFromNodeCurrentTask()); + "[{}] Part3 Step {}: Received data ({}) from {}\n", this_node_, + msg->step_, data, theContext()->getFromNodeCurrentTask()); } steps_recv_[msg->step_] = true; @@ -362,19 +277,20 @@ struct Allreduce { void partFour() { if (is_part_of_adjustment_group_ and is_even_) { - if constexpr (debug) { + if constexpr (isdebug) { fmt::print( "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); } - proxy_[this_node_ + 1].template send<&Allreduce::partFourHandler>(val_); + proxy_[this_node_ + 1].template send<&DistanceDoubling::partFourHandler>( + val_); } } - void partFourHandler(AllreduceMsg* msg) { val_ = msg->val_; } + void partFourHandler(AllreduceDblMsg* msg) { val_ = msg->val_; } NodeType this_node_ = {}; bool is_even_ = false; - vt::objgroup::proxy::Proxy proxy_ = {}; + vt::objgroup::proxy::Proxy proxy_ = {}; DataT val_ = {}; NodeType vrt_node_ = {}; bool is_part_of_adjustment_group_ = false; @@ -393,12 +309,8 @@ struct Allreduce { std::vector steps_recv_ = {}; std::vector steps_sent_ = {}; - std::vector r_index_ = {}; - std::vector r_count_ = {}; - std::vector s_index_ = {}; - std::vector s_count_ = {}; }; } // namespace vt::collective::reduce::allreduce -#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_REDUCE_H*/ +#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H*/ diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index bc5275352f..f8284833cc 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -1,128 +1,389 @@ - +/* +//@HEADER +// ***************************************************************************** +// +// reduce.h +// DARMA/vt => Virtual Transport +// +// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact darma@sandia.gov +// +// ***************************************************************************** +//@HEADER +*/ #if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H #define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H -#include "vt/messaging/message/shared_message.h" -#include "vt/objgroup/manager.h" -#include "vt/collective/reduce/allreduce/allreduce.h" +#include "vt/config.h" +#include "vt/context/context.h" +#include "vt/messaging/message/message.h" +#include "vt/objgroup/proxy/proxy_objgroup.h" -#include +#include +#include namespace vt::collective::reduce::allreduce { -template class Op, typename... Args> -void allreduce_r(Args&&... data) { - auto msg = vt::makeMessage(std::forward(data)...); - auto const this_node = vt::theContext()->getNode(); - auto const num_nodes = theContext()->getNumNodes(); - - using Reducer = Allreduce; - - auto grp_proxy = - vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); - - auto const lastNode = num_nodes - 1; - auto const num_steps = static_cast(log2(num_nodes)); - auto const nprocs_pof2 = 1 << num_steps; - auto const nprocs_rem = num_nodes - nprocs_pof2; - - //////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////// STEP 1 //////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////// - - int vrt_node; - bool const is_part_of_adjustment_group = this_node < (2 * nprocs_rem); - bool const is_even = this_node % 2 == 0; - vt::runInEpochCollective([=, &vrt_node] { - vt::runInEpochCollective([=] { - if (is_part_of_adjustment_group) { - auto const partner = is_even ? this_node + 1 : this_node - 1; - grp_proxy[partner].send<&Reducer::sendHandler>( - std::forward(data...)); +constexpr bool debug = true; + +template +struct AllreduceRbnMsg + : SerializeIfNeeded, DataT> { + using MessageParentType = + SerializeIfNeeded<::vt::Message, AllreduceRbnMsg, DataT>; + + AllreduceRbnMsg() = default; + AllreduceRbnMsg(AllreduceRbnMsg const&) = default; + AllreduceRbnMsg(AllreduceRbnMsg&&) = default; + + AllreduceRbnMsg(DataT&& in_val, int step = 0) + : MessageParentType(), + val_(std::forward(in_val)), + step_(step) { } + AllreduceRbnMsg(DataT const& in_val, int step = 0) + : MessageParentType(), + val_(in_val), + step_(step) { } + + template + void serialize(SerializeT& s) { + MessageParentType::serialize(s); + s | val_; + s | step_; + } + + DataT val_ = {}; + int32_t step_ = {}; +}; + +template +struct Rabenseifner { + void initialize( + const DataT& data, vt::objgroup::proxy::Proxy proxy, + uint32_t num_nodes) { + this_node_ = vt::theContext()->getNode(); + is_even_ = this_node_ % 2 == 0; + val_ = data; + proxy_ = proxy; + num_steps_ = static_cast(log2(num_nodes)); + nprocs_pof2_ = 1 << num_steps_; + nprocs_rem_ = num_nodes - nprocs_pof2_; + is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); + if (is_part_of_adjustment_group_) { + if (is_even_) { + vrt_node_ = this_node_ / 2; + } else { + vrt_node_ = -1; } - }); + } else { + vrt_node_ = this_node_ - nprocs_rem_; + } - vt::runInEpochCollective([=] { - if (is_part_of_adjustment_group and not is_even) { - auto& vec = grp_proxy[this_node].get()->data_; - grp_proxy[this_node - 1].send<&Reducer::reducedHan>( - std::vector{vec.begin() + (vec.size() / 2), vec.end()}); + r_index_.resize(num_steps_, 0); + r_count_.resize(num_steps_, 0); + s_index_.resize(num_steps_, 0); + s_count_.resize(num_steps_, 0); + + w_size_ = data.size(); + + int step = 0; + size_t wsize = data.size(); + for (int mask = 1; mask < nprocs_pof2_; mask <<= 1) { + auto vdest = vrt_node_ ^ mask; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + + if (this_node_ < dest) { + r_count_[step] = wsize / 2; + s_count_[step] = wsize - r_count_[step]; + s_index_[step] = r_index_[step] + r_count_[step]; + } else { + s_count_[step] = wsize / 2; + r_count_[step] = wsize - s_count_[step]; + r_index_[step] = s_index_[step] + s_count_[step]; } - }); - if (is_part_of_adjustment_group) { - if (is_even) { - vrt_node = this_node / 2; + if (step + 1 < num_steps_) { + r_index_[step + 1] = r_index_[step]; + s_index_[step + 1] = r_index_[step]; + wsize = r_count_[step]; + step++; + } + } + + expected_send_ = num_steps_; + expected_recv_ = num_steps_; + steps_sent_.resize(num_steps_, false); + steps_recv_.resize(num_steps_, false); + + if constexpr (debug) { + std::string str(1024, 0x0); + for (int i = 0; i < num_steps_; ++i) { + str.append(fmt::format( + "Step{}: send_idx = {} send_count = {} recieve_idx = {} " + "recieve_count " + "= {}\n", + i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); + } + fmt::print( + "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, + w_size_, num_steps_, str); + } + } + + void partOne() { + if (is_part_of_adjustment_group_) { + auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1; + + if (is_even_) { + proxy_[partner].template send<&Rabenseifner::partOneRightHalf>( + DataT{val_.begin() + (val_.size() / 2), val_.end()}); } else { - vrt_node = -1; + proxy_[partner].template send<&Rabenseifner::partOneLeftHalf>( + DataT{val_.begin(), val_.end() - (val_.size() / 2)}); + } + } + } + + void partOneRightHalf(AllreduceRbnMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[(val_.size() / 2) + i] += msg->val_[i]; + } + + // Send to left node + proxy_[theContext()->getNode() - 1] + .template send<&Rabenseifner::partOneFinalPart>( + DataT{val_.begin() + (val_.size() / 2), val_.end()}); + } + + void partOneLeftHalf(AllreduceRbnMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[i] += msg->val_[i]; + } + } + + void partOneFinalPart(AllreduceRbnMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[(val_.size() / 2) + i] = msg->val_[i]; + } + + // partTwo(); + } + + void partTwo() { + if ( + vrt_node_ == -1 or (step_ >= num_steps_) or + (not std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; }))) { + return; + } + + auto vdest = vrt_node_ ^ mask_; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + if constexpr (debug) { + fmt::print( + "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and " + "count " + "{} \n", + this_node_, step_, dest, s_index_[step_], s_count_[step_]); + } + proxy_[dest].template send<&Rabenseifner::partTwoHandler>( + DataT{ + val_.begin() + (s_index_[step_]), + val_.begin() + (s_index_[step_]) + s_count_[step_]}, + step_); + + mask_ <<= 1; + num_send_++; + steps_sent_[step_] = true; + step_++; + + if (std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; })) { + partTwo(); + } + } + + void partTwoHandler(AllreduceRbnMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[r_index_[msg->step_] + i] += msg->val_[i]; + } + if constexpr (debug) { + fmt::print( + "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: " + "idx = {} from {}\n", + this_node_, msg->step_, mask_, nprocs_pof2_, r_index_[msg->step_], + theContext()->getFromNodeCurrentTask()); + } + steps_recv_[msg->step_] = true; + num_recv_++; + if (mask_ < nprocs_pof2_) { + if (std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; })) { + partTwo(); } + } else { + // step_ = num_steps_ - 1; + // mask_ = nprocs_pof2_ >> 1; + // partThree(); + } + } + + void partThree() { + if ( + vrt_node_ == -1 or + (not std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; }))) { + return; + } + + if (not startedPartThree_) { + step_ = num_steps_ - 1; + mask_ = nprocs_pof2_ >> 1; + num_send_ = 0; + num_recv_ = 0; + startedPartThree_ = true; + std::fill(steps_sent_.begin(), steps_sent_.end(), false); + std::fill(steps_recv_.begin(), steps_recv_.end(), false); + } + + auto vdest = vrt_node_ ^ mask_; + auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; + + if constexpr (debug) { + fmt::print( + "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and " + "count " + "{} \n", + this_node_, step_, dest, r_index_[step_], r_count_[step_]); + } + proxy_[dest].template send<&Rabenseifner::partThreeHandler>( + DataT{ + val_.begin() + (r_index_[step_]), + val_.begin() + (r_index_[step_]) + r_count_[step_]}, + step_); + + steps_sent_[step_] = true; + num_send_++; + mask_ >>= 1; + step_--; + if ( + step_ >= 0 and + std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; })) { + partThree(); + } + } + + void partThreeHandler(AllreduceRbnMsg* msg) { + for (int i = 0; i < msg->val_.size(); i++) { + val_[s_index_[msg->step_] + i] = msg->val_[i]; + } + + if (not startedPartThree_) { + step_ = num_steps_ - 1; + mask_ = nprocs_pof2_ >> 1; + num_send_ = 0; + num_recv_ = 0; + startedPartThree_ = true; + std::fill(steps_sent_.begin(), steps_sent_.end(), false); + std::fill(steps_recv_.begin(), steps_recv_.end(), false); + } + + num_recv_++; + if constexpr (debug) { + fmt::print( + "[{}] Part3 Step {}: Received idx = {} from {}\n", this_node_, + msg->step_, s_index_[msg->step_], + theContext()->getFromNodeCurrentTask()); + } + + steps_recv_[msg->step_] = true; + + if ( + mask_ > 0 and + ((step_ == num_steps_ - 1) or + std::all_of( + steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), + [](const auto val) { return val; }))) { + partThree(); + } + } + + void partFour() { + if (is_part_of_adjustment_group_ and is_even_) { + if constexpr (debug) { + fmt::print( + "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); + } + proxy_[this_node_ + 1].template send<&Rabenseifner::partFourHandler>( + val_, 0); + } + } + + void partFourHandler(AllreduceRbnMsg* msg) { val_ = msg->val_; } + + NodeType this_node_ = {}; + bool is_even_ = false; + vt::objgroup::proxy::Proxy proxy_ = {}; + DataT val_ = {}; + NodeType vrt_node_ = {}; + bool is_part_of_adjustment_group_ = false; + int32_t num_steps_ = {}; + int32_t nprocs_pof2_ = {}; + int32_t nprocs_rem_ = {}; + int32_t mask_ = 1; + bool startedPartThree_ = false; + + size_t w_size_ = {}; + int32_t step_ = 0; + int32_t num_send_ = 0; + int32_t expected_send_ = 0; + int32_t num_recv_ = 0; + int32_t expected_recv_ = 0; - } else { /* rank >= 2 * nprocs_rem */ - vrt_node = this_node - nprocs_rem; - } - }); - - //////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////// STEP 2 //////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////////////// - - // int step = 0; - // auto const wsize = data.size(); - - // auto& vec = grp_proxy[this_node].get()->data_; - - // /* - // Scatter Reduce (distance doubling with vector halving) - // */ - // for (int mask = 1; mask < (1 << num_steps); mask <<= 1) { - // int vdest = vrt_node ^ mask; - // int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; - - // vt::runInEpochCollective([=] { - // if (vrt_node != -1) { - // if (this_node < dest) { - // grp_proxy[dest].send<&NodeObj::rightHalf>( - // std::vector{vec.begin() + (vec.size() / 2), vec.end()}); - // } else { - // grp_proxy[dest].send<&NodeObj::leftHalf>( - // std::vector{vec.begin(), vec.end() - (vec.size() / 2)}); - // } - // } - // }); - // } - - // step = num_steps - 1; - - // /* - // AllGather (distance halving with vector halving) - // */ - // for (int mask = (1 << num_steps) >> 1; mask > 0; mask >>= 1) { - // int vdest = vrt_node ^ mask; - // /* Translate vdest virtual rank to real rank */ - // int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; - // vt::runInEpochCollective([=] { - // if (vrt_node != -1) { - // if (this_node < dest) { - // grp_proxy[dest].send<&NodeObj::leftHalfComplete>( - // std::vector{vec.begin(), vec.end() - (vec.size() / 2)}); - // } else { - // grp_proxy[dest].send<&NodeObj::rightHalfComplete>( - // std::vector{vec.begin() + (vec.size() / 2), vec.end()}); - // } - // } - // }); - // } - - /* - Send to excluded nodes (if needed) - */ - - /* - Local invoke of the handler - */ -} + std::vector steps_recv_ = {}; + std::vector steps_sent_ = {}; + std::vector r_index_ = {}; + std::vector r_count_ = {}; + std::vector s_index_ = {}; + std::vector s_count_ = {}; +}; } // namespace vt::collective::reduce::allreduce -#endif // INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H +#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H*/ diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 13f4d80de8..b58a864e19 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -41,8 +41,6 @@ //@HEADER */ -#include "vt/messaging/message/smart_ptr.h" -#include #if !defined INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H #define INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H @@ -59,7 +57,10 @@ #include "vt/collective/collective_alg.h" #include "vt/messaging/active.h" #include "vt/elm/elm_id_bits.h" -#include "vt/collective/reduce/allreduce/allreduce.h" +#include "vt/collective/reduce/allreduce/distance_doubling.h" +#include "vt/collective/reduce/allreduce/rabenseifner.h" +#include "vt/messaging/message/smart_ptr.h" +#include #include @@ -278,7 +279,8 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { return PendingSendType{nullptr}; } - using Reducer = collective::reduce::allreduce::Allreduce; + using Reducer = collective::reduce::allreduce::Rabenseifner; + // using Reducer = collective::reduce::allreduce::DistanceDoubling; auto grp_proxy = vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); @@ -286,18 +288,31 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { grp_proxy[this_node].template invoke<&Reducer::initialize>( data, grp_proxy, num_nodes); + if(this_node == 0){ + fmt::print("\nStarting part one ...\n"); + } + vt::runInEpochCollective([=] { grp_proxy[this_node].template invoke<&Reducer::partOne>(); }); + if(this_node == 0){ + fmt::print("Starting part two ...\n"); + } vt::runInEpochCollective([=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); + if(this_node == 0){ + fmt::print("Starting part three ...\n"); + } vt::runInEpochCollective([=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); + if(this_node == 0){ + fmt::print("Starting part four ...\n"); + } vt::runInEpochCollective([=] { grp_proxy[this_node].template invoke<&Reducer::partFour>(); }); diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc index a093fd0a37..0d97bbe698 100644 --- a/tests/perf/allreduce.cc +++ b/tests/perf/allreduce.cc @@ -46,7 +46,6 @@ #include #include #include -#include #include diff --git a/tests/perf/common/test_harness_macros.h b/tests/perf/common/test_harness_macros.h index bb668cfe26..e1eb579b2a 100644 --- a/tests/perf/common/test_harness_macros.h +++ b/tests/perf/common/test_harness_macros.h @@ -69,8 +69,6 @@ namespace vt { namespace tests { namespace perf { namespace common { * * VT_PERF_TEST_MAIN() */ - - struct PerfTestRegistry{ static void AddTest(std::unique_ptr&& test) { tests_.push_back(std::move(test)); diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index 08d92c148d..643bb28c89 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -110,39 +110,31 @@ struct NodeObj { vt::objgroup::proxy::Proxy proxy_ = {}; }; -VT_PERF_TEST(MyTest, test_reduce) { - auto grp_proxy = - vt::theObjGroup()->makeCollective("test_allreduce", this); +// VT_PERF_TEST(MyTest, test_reduce) { +// auto grp_proxy = +// vt::theObjGroup()->makeCollective("test_allreduce", this); - if (theContext()->getNode() == 0) { - theTerm()->disableTD(); - } +// if (theContext()->getNode() == 0) { +// theTerm()->disableTD(); +// } - vt::runInEpochCollective([=] { - grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); - }); +// vt::runInEpochCollective([=] { +// grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); +// }); - if (theContext()->getNode() == 0) { - theTerm()->enableTD(); - } -} +// if (theContext()->getNode() == 0) { +// theTerm()->enableTD(); +// } +// } VT_PERF_TEST(MyTest, test_allreduce) { auto grp_proxy = vt::theObjGroup()->makeCollective("test_allreduce", this); - if (theContext()->getNode() == 0) { - theTerm()->disableTD(); - } - vt::runInEpochCollective([=] { grp_proxy.allreduce_h<&NodeObj::newReduceComplete, collective::PlusOp>( data); }); - - if (theContext()->getNode() == 0) { - theTerm()->enableTD(); - } } VT_PERF_TEST_MAIN() From 90a20e03af278346c431d6d61e2c631e48ba65ec Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Thu, 11 Apr 2024 22:35:21 +0200 Subject: [PATCH 06/29] #2240: Make sure the order of reduce operations is correct --- .../reduce/allreduce/distance_doubling.h | 166 ++++++------------ .../reduce/allreduce/rabenseifner.h | 18 +- src/vt/objgroup/manager.impl.h | 91 +++++----- tests/perf/reduce.cc | 32 ++-- 4 files changed, 115 insertions(+), 192 deletions(-) diff --git a/src/vt/collective/reduce/allreduce/distance_doubling.h b/src/vt/collective/reduce/allreduce/distance_doubling.h index a5b508fc0f..ddef0b28c3 100644 --- a/src/vt/collective/reduce/allreduce/distance_doubling.h +++ b/src/vt/collective/reduce/allreduce/distance_doubling.h @@ -48,6 +48,8 @@ #include "vt/context/context.h" #include "vt/messaging/message/message.h" #include "vt/objgroup/proxy/proxy_objgroup.h" +#include "vt/configs/error/config_assert.h" +#include "vt/messaging/message/smart_ptr.h" #include #include @@ -85,16 +87,20 @@ struct AllreduceDblMsg int32_t step_ = {}; }; -template +template struct DistanceDoubling { void initialize( const DataT& data, vt::objgroup::proxy::Proxy proxy, + vt::objgroup::proxy::Proxy parentProxy, uint32_t num_nodes) { this_node_ = vt::theContext()->getNode(); is_even_ = this_node_ % 2 == 0; val_ = data; proxy_ = proxy; + parentProxy_ = parentProxy; num_steps_ = static_cast(log2(num_nodes)); + messages.resize(num_steps_, nullptr); + nprocs_pof2_ = 1 << num_steps_; nprocs_rem_ = num_nodes - nprocs_pof2_; is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); @@ -117,24 +123,32 @@ struct DistanceDoubling { } void partOne() { - if (is_part_of_adjustment_group_ and is_even_) { - proxy_[this_node_ + 1].template send<&DistanceDoubling::partOneHandler>( + if (not nprocs_rem_) { + // we're running on power of 2 number of nodes, proceed to second step + partTwo(); + } else if (is_part_of_adjustment_group_ and not is_even_) { + proxy_[this_node_ - 1].template send<&DistanceDoubling::partOneHandler>( val_); } } void partOneHandler(AllreduceDblMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[i] += msg->val_[i]; - } + Op(val_, msg->val_); + // for (int i = 0; i < msg->val_.size(); i++) { + // val_[i] += msg->val_[i]; + // } + + partTwo(); } + bool isValid() { return (vrt_node_ != -1) and (step_ < num_steps_); } + bool isReady() { + return std::all_of( + steps_recv_.cbegin(), steps_recv_.cbegin() + step_, + [](const auto val) { return val; }); + } void partTwo() { - if ( - vrt_node_ == -1 or (step_ >= num_steps_) or - (not std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; }))) { + if (not isValid() or not isReady()) { return; } @@ -144,6 +158,12 @@ struct DistanceDoubling { fmt::print( "[{}] Part2 Step {}: Sending to Node {} \n", this_node_, step_, dest); } + if (step_) { + for (int i = 0; i < val_.size(); ++i) { + val_[i] += messages.at(step_ - 1)->val_[i]; + } + } + proxy_[dest].template send<&DistanceDoubling::partTwoHandler>(val_, step_); mask_ <<= 1; @@ -151,146 +171,60 @@ struct DistanceDoubling { steps_sent_[step_] = true; step_++; - if (std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; })) { + if (isReady()) { partTwo(); } } void partTwoHandler(AllreduceDblMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[i] += msg->val_[i]; - } + messages.at(msg->step_) = promoteMsg(msg); + if constexpr (isdebug) { - std::string data(128, 0x0); + std::string data(1024, 0x0); for (auto val : msg->val_) { data.append(fmt::format("{} ", val)); } fmt::print( "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: Received data ({}) " - "idx = {} from {}\n", + "from {}\n", this_node_, msg->step_, mask_, nprocs_pof2_, data, theContext()->getFromNodeCurrentTask()); } steps_recv_[msg->step_] = true; num_recv_++; if (mask_ < nprocs_pof2_) { - if (std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; })) { + if (isReady()) { partTwo(); } - } else { - // step_ = num_steps_ - 1; - // mask_ = nprocs_pof2_ >> 1; - // partThree(); } } void partThree() { - if ( - vrt_node_ == -1 or - (not std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; }))) { - return; - } - - if (not startedPartThree_) { - step_ = num_steps_ - 1; - mask_ = nprocs_pof2_ >> 1; - num_send_ = 0; - num_recv_ = 0; - startedPartThree_ = true; - std::fill(steps_sent_.begin(), steps_sent_.end(), false); - std::fill(steps_recv_.begin(), steps_recv_.end(), false); - } - - auto vdest = vrt_node_ ^ mask_; - auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; - - if constexpr (isdebug) { - std::string data(1024, 0x0); - - for (auto val : val_) { - data.append(fmt::format("{} ", val)); - } - - fmt::print( - "[{}] Part3 Step {}: Sending to Node {} data={} \n", this_node_, step_, - dest, data); - } - proxy_[dest].template send<&DistanceDoubling::partThreeHandler>( - val_, step_); - - steps_sent_[step_] = true; - num_send_++; - mask_ >>= 1; - step_--; - if ( - step_ >= 0 and - std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; })) { - partThree(); - } - } - - void partThreeHandler(AllreduceDblMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[i] = msg->val_[i]; - } - - if (not startedPartThree_) { - step_ = num_steps_ - 1; - mask_ = nprocs_pof2_ >> 1; - num_send_ = 0; - num_recv_ = 0; - startedPartThree_ = true; - std::fill(steps_sent_.begin(), steps_sent_.end(), false); - std::fill(steps_recv_.begin(), steps_recv_.end(), false); - } - - num_recv_++; - if constexpr (isdebug) { - std::string data(128, 0x0); - for (auto val : msg->val_) { - data.append(fmt::format("{} ", val)); - } - fmt::print( - "[{}] Part3 Step {}: Received data ({}) from {}\n", this_node_, - msg->step_, data, theContext()->getFromNodeCurrentTask()); - } - - steps_recv_[msg->step_] = true; - - if ( - mask_ > 0 and - ((step_ == num_steps_ - 1) or - std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; }))) { - partThree(); - } - } - - void partFour() { if (is_part_of_adjustment_group_ and is_even_) { if constexpr (isdebug) { fmt::print( "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); } - proxy_[this_node_ + 1].template send<&DistanceDoubling::partFourHandler>( + proxy_[this_node_ + 1].template send<&DistanceDoubling::partThreeHandler>( val_); } } - void partFourHandler(AllreduceDblMsg* msg) { val_ = msg->val_; } + void partThreeHandler(AllreduceDblMsg* msg) { val_ = msg->val_; } + void finalPart() { + if (vrt_node_ != -1) { + for (int i = 0; i < val_.size(); ++i) { + val_[i] += messages.at(step_ - 1)->val_[i]; + } + } + + parentProxy_[this_node_] .template invoke(val_); + } NodeType this_node_ = {}; bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; + vt::objgroup::proxy::Proxy parentProxy_ = {}; DataT val_ = {}; NodeType vrt_node_ = {}; bool is_part_of_adjustment_group_ = false; @@ -309,6 +243,8 @@ struct DistanceDoubling { std::vector steps_recv_ = {}; std::vector steps_sent_ = {}; + + std::vector>> messages = {}; }; } // namespace vt::collective::reduce::allreduce diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index f8284833cc..e27b42de8c 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -54,7 +54,7 @@ namespace vt::collective::reduce::allreduce { -constexpr bool debug = true; +constexpr bool debug = false; template struct AllreduceRbnMsg @@ -140,23 +140,13 @@ struct Rabenseifner { } } - expected_send_ = num_steps_; - expected_recv_ = num_steps_; steps_sent_.resize(num_steps_, false); steps_recv_.resize(num_steps_, false); if constexpr (debug) { - std::string str(1024, 0x0); - for (int i = 0; i < num_steps_; ++i) { - str.append(fmt::format( - "Step{}: send_idx = {} send_count = {} recieve_idx = {} " - "recieve_count " - "= {}\n", - i, s_index_[i], s_count_[i], r_index_[i], r_count_[i])); - } fmt::print( - "[{}] Initialize with size = {} num_steps {} \n {}", this_node_, - w_size_, num_steps_, str); + "[{}] Initialize with size = {} num_steps {} \n", this_node_, + w_size_, num_steps_); } } @@ -372,9 +362,7 @@ struct Rabenseifner { size_t w_size_ = {}; int32_t step_ = 0; int32_t num_send_ = 0; - int32_t expected_send_ = 0; int32_t num_recv_ = 0; - int32_t expected_recv_ = 0; std::vector steps_recv_ = {}; std::vector steps_sent_ = {}; diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index b58a864e19..7544336b48 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -41,6 +41,7 @@ //@HEADER */ +#include #if !defined INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H #define INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H @@ -275,51 +276,57 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { auto const this_node = vt::theContext()->getNode(); auto const num_nodes = theContext()->getNumNodes(); - if(num_nodes < 2){ + if (num_nodes < 2) { return PendingSendType{nullptr}; } - using Reducer = collective::reduce::allreduce::Rabenseifner; - // using Reducer = collective::reduce::allreduce::DistanceDoubling; - - auto grp_proxy = - vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); - - grp_proxy[this_node].template invoke<&Reducer::initialize>( - data, grp_proxy, num_nodes); - - if(this_node == 0){ - fmt::print("\nStarting part one ...\n"); - } - - vt::runInEpochCollective([=] { - grp_proxy[this_node].template invoke<&Reducer::partOne>(); - }); - - if(this_node == 0){ - fmt::print("Starting part two ...\n"); - } - vt::runInEpochCollective([=] { - grp_proxy[this_node].template invoke<&Reducer::partTwo>(); - }); - - if(this_node == 0){ - fmt::print("Starting part three ...\n"); - } - vt::runInEpochCollective([=] { - grp_proxy[this_node].template invoke<&Reducer::partThree>(); - }); - - if(this_node == 0){ - fmt::print("Starting part four ...\n"); - } - vt::runInEpochCollective([=] { - grp_proxy[this_node].template invoke<&Reducer::partFour>(); - }); - - proxy[this_node].template invoke(grp_proxy.get()->val_); - - return PendingSendType{nullptr}; + // using Reducer = collective::reduce::allreduce::Rabenseifner; + using Reducer = collective::reduce::allreduce::DistanceDoubling; + + return PendingSendType{[=] { + auto grp_proxy = + vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); + if constexpr (std::is_same_v< + Reducer, + collective::reduce::allreduce::DistanceDoubling>) { + grp_proxy[this_node].template invoke<&Reducer::initialize>( + data, grp_proxy, num_nodes); + + grp_proxy[this_node].template invoke<&Reducer::partOne>(); + + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); + + // grp_proxy[this_node].template invoke<&Reducer::finalPart>(); + + // if (grp_proxy.get()->nprocs_rem_) { + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); + // } + } else if constexpr (std::is_same_v< + Reducer, + collective::reduce::allreduce::Rabenseifner< + DataT>>) { + grp_proxy[this_node].template invoke<&Reducer::initialize>( + data, grp_proxy, num_nodes); + + if (grp_proxy.get()->nprocs_rem_) { + vt::runInEpochCollective( + [=] { grp_proxy[this_node].template invoke<&Reducer::partOne>(); }); + } + + vt::runInEpochCollective( + [=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); + + vt::runInEpochCollective( + [=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); + + if (grp_proxy.get()->nprocs_rem_) { + vt::runInEpochCollective( + [=] { grp_proxy[this_node].template invoke<&Reducer::partFour>(); }); + } + } + }}; } template *f> diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index 643bb28c89..a2b703600b 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -110,31 +110,23 @@ struct NodeObj { vt::objgroup::proxy::Proxy proxy_ = {}; }; -// VT_PERF_TEST(MyTest, test_reduce) { -// auto grp_proxy = -// vt::theObjGroup()->makeCollective("test_allreduce", this); - -// if (theContext()->getNode() == 0) { -// theTerm()->disableTD(); -// } - -// vt::runInEpochCollective([=] { -// grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); -// }); +VT_PERF_TEST(MyTest, test_reduce) { + auto grp_proxy = + vt::theObjGroup()->makeCollective("test_allreduce", this); -// if (theContext()->getNode() == 0) { -// theTerm()->enableTD(); -// } -// } + grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); +} VT_PERF_TEST(MyTest, test_allreduce) { auto grp_proxy = - vt::theObjGroup()->makeCollective("test_allreduce", this); + vt::theObjGroup()->makeCollective("test_allreduce_new", this); + + grp_proxy.allreduce_h<&NodeObj::newReduceComplete, collective::PlusOp>(data); +} - vt::runInEpochCollective([=] { - grp_proxy.allreduce_h<&NodeObj::newReduceComplete, collective::PlusOp>( - data); - }); +VT_PERF_TEST(MyTest, test_epoch_collective) { + vt::runInEpochCollective([] {}); + vt::runInEpochCollective([] {}); } VT_PERF_TEST_MAIN() From 8bf1cc94dce3879a6a184aaa42c84853e4c9864d Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Mon, 15 Apr 2024 21:51:29 +0200 Subject: [PATCH 07/29] #2240: Working Recursive doubling --- .../reduce/allreduce/rabenseifner.h | 13 +- ...stance_doubling.h => recursive_doubling.h} | 203 ++++++++++++------ src/vt/objgroup/manager.impl.h | 83 ++++--- tests/perf/allreduce.cc | 140 +++++++++--- tests/perf/reduce.cc | 84 +++----- tests/perf/send_cost.cc | 1 + 6 files changed, 318 insertions(+), 206 deletions(-) rename src/vt/collective/reduce/allreduce/{distance_doubling.h => recursive_doubling.h} (58%) diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index e27b42de8c..8af07bff6a 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -86,11 +86,13 @@ struct AllreduceRbnMsg int32_t step_ = {}; }; -template +template < + typename DataT, template class Op, typename ObjT, + auto finalHandler> struct Rabenseifner { void initialize( const DataT& data, vt::objgroup::proxy::Proxy proxy, - uint32_t num_nodes) { + vt::objgroup::proxy::Proxy parentProxy, uint32_t num_nodes) { this_node_ = vt::theContext()->getNode(); is_even_ = this_node_ % 2 == 0; val_ = data; @@ -145,8 +147,8 @@ struct Rabenseifner { if constexpr (debug) { fmt::print( - "[{}] Initialize with size = {} num_steps {} \n", this_node_, - w_size_, num_steps_); + "[{}] Initialize with size = {} num_steps {} \n", this_node_, w_size_, + num_steps_); } } @@ -186,7 +188,7 @@ struct Rabenseifner { val_[(val_.size() / 2) + i] = msg->val_[i]; } - // partTwo(); + partTwo(); } void partTwo() { @@ -350,6 +352,7 @@ struct Rabenseifner { NodeType this_node_ = {}; bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; + vt::objgroup::proxy::Proxy parentProxy_ = {}; DataT val_ = {}; NodeType vrt_node_ = {}; bool is_part_of_adjustment_group_ = false; diff --git a/src/vt/collective/reduce/allreduce/distance_doubling.h b/src/vt/collective/reduce/allreduce/recursive_doubling.h similarity index 58% rename from src/vt/collective/reduce/allreduce/distance_doubling.h rename to src/vt/collective/reduce/allreduce/recursive_doubling.h index ddef0b28c3..e1ac7873d6 100644 --- a/src/vt/collective/reduce/allreduce/distance_doubling.h +++ b/src/vt/collective/reduce/allreduce/recursive_doubling.h @@ -87,22 +87,23 @@ struct AllreduceDblMsg int32_t step_ = {}; }; -template +template < + typename DataT, template class Op, typename ObjT, + auto finalHandler> struct DistanceDoubling { - void initialize( - const DataT& data, vt::objgroup::proxy::Proxy proxy, - vt::objgroup::proxy::Proxy parentProxy, - uint32_t num_nodes) { + template + DistanceDoubling(NodeType num_nodes, Args&&... args) + : val_(std::forward(args)...), + num_nodes_(num_nodes) { } + + void initialize() { this_node_ = vt::theContext()->getNode(); is_even_ = this_node_ % 2 == 0; - val_ = data; - proxy_ = proxy; - parentProxy_ = parentProxy; - num_steps_ = static_cast(log2(num_nodes)); + num_steps_ = static_cast(log2(num_nodes_)); messages.resize(num_steps_, nullptr); nprocs_pof2_ = 1 << num_steps_; - nprocs_rem_ = num_nodes - nprocs_pof2_; + nprocs_rem_ = num_nodes_ - nprocs_pof2_; is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); if (is_part_of_adjustment_group_) { if (is_even_) { @@ -114,41 +115,76 @@ struct DistanceDoubling { vrt_node_ = this_node_ - nprocs_rem_; } - w_size_ = data.size(); - - expected_send_ = num_steps_; - expected_recv_ = num_steps_; - steps_sent_.resize(num_steps_, false); steps_recv_.resize(num_steps_, false); + steps_reduced_.resize(num_steps_, false); + + initialized_ = true; + } + + void allreduce( + vt::objgroup::proxy::Proxy proxy, + vt::objgroup::proxy::Proxy parentProxy) { + if (not initialized_) { + initialize(); + } + + proxy_ = proxy; + parent_proxy_ = parentProxy; + + if (nprocs_rem_) { + adjustForPowerOfTwo(); + } else { + reduceIter(); + } } - void partOne() { - if (not nprocs_rem_) { - // we're running on power of 2 number of nodes, proceed to second step - partTwo(); - } else if (is_part_of_adjustment_group_ and not is_even_) { - proxy_[this_node_ - 1].template send<&DistanceDoubling::partOneHandler>( - val_); + void adjustForPowerOfTwo() { + if (is_part_of_adjustment_group_ and not is_even_) { + if constexpr (isdebug) { + fmt::print( + "[{}] Part1: Sending to Node {} \n", this_node_, this_node_ - 1); + } + + proxy_[this_node_ - 1] + .template send<&DistanceDoubling::adjustForPowerOfTwoHandler>(val_); } } - void partOneHandler(AllreduceDblMsg* msg) { - Op(val_, msg->val_); - // for (int i = 0; i < msg->val_.size(); i++) { - // val_[i] += msg->val_[i]; - // } + void adjustForPowerOfTwoHandler(AllreduceDblMsg* msg) { + if constexpr (isdebug) { + std::string data(1024, 0x0); + for (auto val : msg->val_) { + data.append(fmt::format("{} ", val)); + } + fmt::print( + "[{}] Part1 Handler initialized_ = {}: Received data ({}) " + "from {}\n", + this_node_, initialized_, data, theContext()->getFromNodeCurrentTask()); + } + + Op()(val_, msg->val_); - partTwo(); + finished_adjustment_part_ = true; + + reduceIter(); } + bool done() { return step_ == num_steps_ and allMessagesReceived(); } bool isValid() { return (vrt_node_ != -1) and (step_ < num_steps_); } - bool isReady() { + bool allMessagesReceived() { return std::all_of( steps_recv_.cbegin(), steps_recv_.cbegin() + step_, [](const auto val) { return val; }); } - void partTwo() { - if (not isValid() or not isReady()) { + bool isReady() { + return (is_part_of_adjustment_group_ and finished_adjustment_part_) and + step_ == 0 or + allMessagesReceived(); + } + + void reduceIter() { + // Ensure we have received all necessary messages + if (not isReady()) { return; } @@ -158,91 +194,122 @@ struct DistanceDoubling { fmt::print( "[{}] Part2 Step {}: Sending to Node {} \n", this_node_, step_, dest); } - if (step_) { - for (int i = 0; i < val_.size(); ++i) { - val_[i] += messages.at(step_ - 1)->val_[i]; - } - } - proxy_[dest].template send<&DistanceDoubling::partTwoHandler>(val_, step_); + proxy_[dest].template send<&DistanceDoubling::reduceIterHandler>( + val_, step_); mask_ <<= 1; - num_send_++; - steps_sent_[step_] = true; step_++; - if (isReady()) { - partTwo(); + tryReduce(step_ - 1); + + if (done()) { + finalPart(); + } else if (isReady()) { + reduceIter(); } } - void partTwoHandler(AllreduceDblMsg* msg) { - messages.at(msg->step_) = promoteMsg(msg); + void tryReduce(int32_t step) { + if ( + (step < step_) and not steps_reduced_[step] and steps_recv_[step] and + std::all_of( + steps_reduced_.cbegin(), steps_reduced_.cbegin() + step, + [](const auto val) { return val; })) { + Op()(val_, messages.at(step)->val_); + steps_reduced_[step] = true; + } + } + void reduceIterHandler(AllreduceDblMsg* msg) { if constexpr (isdebug) { std::string data(1024, 0x0); for (auto val : msg->val_) { data.append(fmt::format("{} ", val)); } fmt::print( - "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: Received data ({}) " + "[{}] Part2 Step {} initialized_ = {} mask_= {} nprocs_pof2_ = {}: " + "Received data ({}) " "from {}\n", - this_node_, msg->step_, mask_, nprocs_pof2_, data, + this_node_, msg->step_, initialized_, mask_, nprocs_pof2_, data, theContext()->getFromNodeCurrentTask()); } - steps_recv_[msg->step_] = true; - num_recv_++; - if (mask_ < nprocs_pof2_) { - if (isReady()) { - partTwo(); + + // Special case when we receive step 2 message before step 1 is done on this node + if (not finished_adjustment_part_) { + if (not initialized_) { + initialize(); } + + messages.at(msg->step_) = promoteMsg(msg); + steps_recv_[msg->step_] = true; + + return; + } + + messages.at(msg->step_) = promoteMsg(msg); + steps_recv_[msg->step_] = true; + + tryReduce(msg->step_); + + if ((mask_ < nprocs_pof2_) and isReady()) { + reduceIter(); + + } else if (done()) { + finalPart(); } } - void partThree() { + void sendToExcludedNodes() { if (is_part_of_adjustment_group_ and is_even_) { if constexpr (isdebug) { fmt::print( - "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); + "[{}] Part3 : Sending to Node {} \n", this_node_, this_node_ + 1); } - proxy_[this_node_ + 1].template send<&DistanceDoubling::partThreeHandler>( - val_); + proxy_[this_node_ + 1] + .template send<&DistanceDoubling::sendToExcludedNodesHandler>(val_); } } - void partThreeHandler(AllreduceDblMsg* msg) { val_ = msg->val_; } + void sendToExcludedNodesHandler(AllreduceDblMsg* msg) { + val_ = msg->val_; + + parent_proxy_[this_node_].template invoke(val_); + } + void finalPart() { - if (vrt_node_ != -1) { - for (int i = 0; i < val_.size(); ++i) { - val_[i] += messages.at(step_ - 1)->val_[i]; - } + if (completed_) { + return; + } + + if (nprocs_rem_) { + sendToExcludedNodes(); } - parentProxy_[this_node_] .template invoke(val_); + parent_proxy_[this_node_].template invoke(val_); + completed_ = true; } NodeType this_node_ = {}; + uint32_t num_nodes_ = {}; bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; - vt::objgroup::proxy::Proxy parentProxy_ = {}; + vt::objgroup::proxy::Proxy parent_proxy_ = {}; DataT val_ = {}; NodeType vrt_node_ = {}; + bool initialized_ = false; bool is_part_of_adjustment_group_ = false; + bool finished_adjustment_part_ = false; int32_t num_steps_ = {}; int32_t nprocs_pof2_ = {}; int32_t nprocs_rem_ = {}; int32_t mask_ = 1; - bool startedPartThree_ = false; - size_t w_size_ = {}; int32_t step_ = 0; - int32_t num_send_ = 0; - int32_t expected_send_ = 0; - int32_t num_recv_ = 0; - int32_t expected_recv_ = 0; + bool completed_ = false; std::vector steps_recv_ = {}; - std::vector steps_sent_ = {}; + std::vector steps_reduced_ = {}; std::vector>> messages = {}; }; diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 7544336b48..7ea2e8c87b 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -41,7 +41,6 @@ //@HEADER */ -#include #if !defined INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H #define INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H @@ -58,7 +57,6 @@ #include "vt/collective/collective_alg.h" #include "vt/messaging/active.h" #include "vt/elm/elm_id_bits.h" -#include "vt/collective/reduce/allreduce/distance_doubling.h" #include "vt/collective/reduce/allreduce/rabenseifner.h" #include "vt/messaging/message/smart_ptr.h" #include @@ -281,51 +279,42 @@ ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { } // using Reducer = collective::reduce::allreduce::Rabenseifner; - using Reducer = collective::reduce::allreduce::DistanceDoubling; - - return PendingSendType{[=] { - auto grp_proxy = - vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); - if constexpr (std::is_same_v< - Reducer, - collective::reduce::allreduce::DistanceDoubling>) { - grp_proxy[this_node].template invoke<&Reducer::initialize>( - data, grp_proxy, num_nodes); - - grp_proxy[this_node].template invoke<&Reducer::partOne>(); - - // vt::runInEpochCollective( - // [=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); - - // grp_proxy[this_node].template invoke<&Reducer::finalPart>(); - - // if (grp_proxy.get()->nprocs_rem_) { - // vt::runInEpochCollective( - // [=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); - // } - } else if constexpr (std::is_same_v< - Reducer, - collective::reduce::allreduce::Rabenseifner< - DataT>>) { - grp_proxy[this_node].template invoke<&Reducer::initialize>( - data, grp_proxy, num_nodes); - - if (grp_proxy.get()->nprocs_rem_) { - vt::runInEpochCollective( - [=] { grp_proxy[this_node].template invoke<&Reducer::partOne>(); }); - } - - vt::runInEpochCollective( - [=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); - - vt::runInEpochCollective( - [=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); - - if (grp_proxy.get()->nprocs_rem_) { - vt::runInEpochCollective( - [=] { grp_proxy[this_node].template invoke<&Reducer::partFour>(); }); - } - } + // using Reducer = collective::reduce::allreduce::DistanceDoubling; + + return PendingSendType{theTerm()->getEpoch(), [=] { + // auto grp_proxy = + // vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); + // if constexpr (std::is_same_v< + // Reducer, + // collective::reduce::allreduce::DistanceDoubling>) { + // grp_proxy[this_node].template invoke<&Reducer::initialize>( + // data, grp_proxy, proxy, num_nodes); + + // grp_proxy[this_node].template invoke<&Reducer::partOne>(); + + // } else if constexpr (std::is_same_v< + // Reducer, + // collective::reduce::allreduce::Rabenseifner< + // DataT, Op, ObjT, f>>) { + // grp_proxy[this_node].template invoke<&Reducer::initialize>( + // data, grp_proxy, num_nodes); + + // if (grp_proxy.get()->nprocs_rem_) { + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partOne>(); }); + // } + + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partTwo>(); }); + + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partThree>(); }); + + // if (grp_proxy.get()->nprocs_rem_) { + // vt::runInEpochCollective( + // [=] { grp_proxy[this_node].template invoke<&Reducer::partFour>(); }); + // } + // } }}; } diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc index 0d97bbe698..645f83136d 100644 --- a/tests/perf/allreduce.cc +++ b/tests/perf/allreduce.cc @@ -41,11 +41,16 @@ //@HEADER */ #include "common/test_harness.h" +#include "vt/collective/collective_alg.h" +#include "vt/collective/reduce/operators/functors/plus_op.h" +#include "vt/configs/error/config_assert.h" #include "vt/context/context.h" #include #include #include #include +#include +#include #include @@ -53,58 +58,131 @@ using namespace vt; using namespace vt::tests::perf::common; static constexpr int num_iters = 1; +struct MyTest : PerfTestHarness { + void SetUp() override { + PerfTestHarness::SetUp(); + data.resize(1 << 4); + for (auto& val : data) { + val = theContext()->getNode() + 1; + } + } -struct MyTest : PerfTestHarness { }; + std::vector data; +}; struct NodeObj { explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } - void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); + void initialize() { + proxy_ = vt::theObjGroup()->getProxy(this); + // data_["Node"] = theContext()->getNode(); } } - struct MyMsg : vt::Message {}; + struct MyMsg : vt::Message { }; - void reduceComplete(std::vector in) { - reduce_counter_++; - test_obj_->StopTimer(fmt::format("{} reduce", i)); - test_obj_->GetMemoryUsage(); - if (i < num_iters) { - i++; - auto this_node = theContext()->getNode(); - proxy_[this_node].send(); - } else if (theContext()->getNode() == 0) { - theTerm()->enableTD(); - } + void recursiveDoubling(std::vector in) { + // std::string printer(1024, 0x0); + // printer.append(fmt::format("\n[{}]: recursiveDoubling done! ", theContext()->getNode())); + + // for (int node = 0; node < theContext()->getNumNodes(); ++node) { + // if (node == theContext()->getNode()) { + + // for (auto val : in) { + // printer.append(fmt::format("{} ", val)); + // } + + // fmt::print("{}\n", printer); + + // theCollective()->barrier(); + // } + // } + + // fmt::print("\n"); + // const auto p = theContext()->getNumNodes(); + // const auto expected = (p * (p + 1)) / 2; + // for (auto val : in) { + // vtAssert(val == expected, "FAILURE!"); + // } + } + + void newReduceComplete(std::vector in) { + // fmt::print( + // "\n[{}]: allreduce_h done! (Size == {}) Results are ...\n", + // theContext()->getNode(), in.size()); + // const auto p = theContext()->getNumNodes(); + // const auto expected = (p * (p + 1)) / 2; + // for (auto val : in) { + // vtAssert(val == expected, "FAILURE!"); + // } + // for (int node = 0; node < theContext()->getNumNodes(); ++node) { + // if (node == theContext()->getNode()) { + // std::string printer(128, 0x0); + // for (auto val : in) { + // printer.append(fmt::format("{} ", val)); + // } + + // fmt::print("{}\n", printer); + + // theCollective()->barrier(); + // } + // } + + // fmt::print("\n"); } - void perfReduce(MyMsg* in_msg) { - test_obj_->StartTimer(fmt::format("{} reduce", i)); + void reduceComplete(std::vector in) { + // fmt::print( + // "[{}]: allreduce done! Results are ...\n", theContext()->getNode()); + // for (auto val : in) { + // fmt::print("{} ", val); + // } - proxy_.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data_); + // fmt::print("\n"); } private: MyTest* test_obj_ = nullptr; vt::objgroup::proxy::Proxy proxy_ = {}; - int reduce_counter_ = -1; - int i = 0; - std::vector data_ = {}; }; VT_PERF_TEST(MyTest, test_reduce) { - auto grp_proxy = vt::theObjGroup()->makeCollective( - "test_reduce", this - ); + auto grp_proxy = + vt::theObjGroup()->makeCollective("test_allreduce", this); - if (theContext()->getNode() == 0) { - theTerm()->disableTD(); - } + vt::runInEpochCollective([=] { + grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); + }); +} - std::vector data(1024, theContext()->getNode()); - grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); +VT_PERF_TEST(MyTest, test_allreduce_rabenseifner) { + auto proxy = + vt::theObjGroup()->makeCollective("test_allreduce_new", this); - if (theContext()->getNode() == 0) { - theTerm()->enableTD(); - } + using DataT = decltype(data); + using Reducer = collective::reduce::allreduce::Rabenseifner< + DataT, collective::PlusOp, NodeObj, &NodeObj::newReduceComplete>; + + auto grp_proxy = + vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); + vt::runInEpochCollective([=] { + grp_proxy[my_node_].template invoke<&Reducer::initialize>( + data, grp_proxy, proxy, num_nodes_); + grp_proxy[my_node_].template invoke<&Reducer::partOne>(); + }); +} + +VT_PERF_TEST(MyTest, test_allreduce_recursive_doubling) { + auto proxy = + vt::theObjGroup()->makeCollective("test_allreduce_new_2", this); + + using DataT = decltype(data); + using Reducer = collective::reduce::allreduce::DistanceDoubling< + DataT, collective::PlusOp, NodeObj, &NodeObj::recursiveDoubling>; + + auto grp_proxy = vt::theObjGroup()->makeCollective( + "allreduce_recursive_doubling", num_nodes_, data); + vt::runInEpochCollective([=] { + grp_proxy[my_node_].template invoke<&Reducer::allreduce>(grp_proxy, proxy); + }); } VT_PERF_TEST_MAIN() diff --git a/tests/perf/reduce.cc b/tests/perf/reduce.cc index a2b703600b..4ab9147b07 100644 --- a/tests/perf/reduce.cc +++ b/tests/perf/reduce.cc @@ -41,14 +41,9 @@ //@HEADER */ #include "common/test_harness.h" -#include "vt/collective/collective_alg.h" -#include "vt/configs/error/config_assert.h" -#include "vt/context/context.h" -#include #include #include #include -#include #include INCLUDE_FMT_CORE @@ -64,69 +59,48 @@ struct MyTest : PerfTestHarness { struct NodeObj { explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } - void initialize() { - proxy_ = vt::theObjGroup()->getProxy(this); - // data_["Node"] = theContext()->getNode(); } + void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); } + + struct MyMsg : vt::Message {}; + + void reduceComplete() { + reduce_counter_++; + test_obj_->StopTimer(fmt::format("{} reduce", i)); + test_obj_->GetMemoryUsage(); + if (i < num_iters) { + i++; + auto this_node = theContext()->getNode(); + proxy_[this_node].send(); + } else if (theContext()->getNode() == 0) { + theTerm()->enableTD(); + } } - struct MyMsg : vt::Message { }; - void newReduceComplete(std::vector in) { - // fmt::print( - // "\n[{}]: allreduce_h done! (Size == {}) Results are ...\n", - // theContext()->getNode(), in.size()); - // const auto p = theContext()->getNumNodes(); - // const auto expected = (p * (p + 1)) / 2; - // for (auto val : in) { - // vtAssert(val == expected, "FAILURE!"); - // } - // for (int node = 0; node < theContext()->getNumNodes(); ++node) { - // if (node == theContext()->getNode()) { - // std::string printer(128, 0x0); - // for (auto val : in) { - // printer.append(fmt::format("{} ", val)); - // } - - // fmt::print("{}\n", printer); - - // theCollective()->barrier(); - // } - // } - - // fmt::print("\n"); - } - - void reduceComplete(std::vector in) { - // fmt::print( - // "[{}]: allreduce done! Results are ...\n", theContext()->getNode()); - // for (auto val : in) { - // fmt::print("{} ", val); - // } - - // fmt::print("\n"); + void perfReduce(MyMsg* in_msg) { + test_obj_->StartTimer(fmt::format("{} reduce", i)); + proxy_.allreduce<&NodeObj::reduceComplete>(); } private: MyTest* test_obj_ = nullptr; vt::objgroup::proxy::Proxy proxy_ = {}; + int reduce_counter_ = -1; + int i = 0; }; VT_PERF_TEST(MyTest, test_reduce) { - auto grp_proxy = - vt::theObjGroup()->makeCollective("test_allreduce", this); - - grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); -} + auto grp_proxy = vt::theObjGroup()->makeCollective( + "test_reduce", this + ); -VT_PERF_TEST(MyTest, test_allreduce) { - auto grp_proxy = - vt::theObjGroup()->makeCollective("test_allreduce_new", this); + if (theContext()->getNode() == 0) { + theTerm()->disableTD(); + } - grp_proxy.allreduce_h<&NodeObj::newReduceComplete, collective::PlusOp>(data); -} + grp_proxy[my_node_].invoke<&NodeObj::initialize>(); -VT_PERF_TEST(MyTest, test_epoch_collective) { - vt::runInEpochCollective([] {}); - vt::runInEpochCollective([] {}); + using MsgType = typename NodeObj::MyMsg; + grp_proxy[my_node_].send(); } VT_PERF_TEST_MAIN() diff --git a/tests/perf/send_cost.cc b/tests/perf/send_cost.cc index 6ce249eb2f..0140b30cbc 100644 --- a/tests/perf/send_cost.cc +++ b/tests/perf/send_cost.cc @@ -609,3 +609,4 @@ VT_PERF_TEST(SendTest, test_collection_send) { } VT_PERF_TEST_MAIN() + From bb1ca10839c4e0b85c320a7b7338b56cab005445 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Tue, 16 Apr 2024 11:53:44 +0200 Subject: [PATCH 08/29] #2240: Code cleanup and make Rabenseifner work with any Op type --- .../reduce/allreduce/rabenseifner.h | 388 +++++++++++------- .../reduce/allreduce/recursive_doubling.h | 95 ++--- src/vt/objgroup/manager.h | 2 +- src/vt/objgroup/manager.impl.h | 3 +- src/vt/objgroup/proxy/proxy_objgroup.impl.h | 2 +- tests/perf/allreduce.cc | 42 +- 6 files changed, 310 insertions(+), 222 deletions(-) diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index 8af07bff6a..fa5ea0a557 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// reduce.h +// rabenseifner.h // DARMA/vt => Virtual Transport // // Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC @@ -90,16 +90,21 @@ template < typename DataT, template class Op, typename ObjT, auto finalHandler> struct Rabenseifner { - void initialize( - const DataT& data, vt::objgroup::proxy::Proxy proxy, - vt::objgroup::proxy::Proxy parentProxy, uint32_t num_nodes) { - this_node_ = vt::theContext()->getNode(); - is_even_ = this_node_ % 2 == 0; - val_ = data; - proxy_ = proxy; - num_steps_ = static_cast(log2(num_nodes)); - nprocs_pof2_ = 1 << num_steps_; - nprocs_rem_ = num_nodes - nprocs_pof2_; + template + Rabenseifner( + vt::objgroup::proxy::Proxy parentProxy, NodeType num_nodes, + Args&&... args) + : parent_proxy_(parentProxy), + val_(std::forward(args)...), + num_nodes_(num_nodes), + this_node_(vt::theContext()->getNode()), + is_even_(this_node_ % 2 == 0), + num_steps_(static_cast(log2(num_nodes_))), + nprocs_pof2_(1 << num_steps_), + nprocs_rem_(num_nodes_ - nprocs_pof2_), + gather_step_(num_steps_ - 1), + gather_mask_(nprocs_pof2_ >> 1), + finished_adjustment_part_(nprocs_rem_ == 0) { is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); if (is_part_of_adjustment_group_) { if (is_even_) { @@ -111,15 +116,21 @@ struct Rabenseifner { vrt_node_ = this_node_ - nprocs_rem_; } + scatter_messages_.resize(num_steps_, nullptr); + scatter_steps_recv_.resize(num_steps_, false); + scatter_steps_reduced_.resize(num_steps_, false); + + gather_messages_.resize(num_steps_, nullptr); + gather_steps_recv_.resize(num_steps_, false); + gather_steps_reduced_.resize(num_steps_, false); + r_index_.resize(num_steps_, 0); r_count_.resize(num_steps_, 0); s_index_.resize(num_steps_, 0); s_count_.resize(num_steps_, 0); - w_size_ = data.size(); - int step = 0; - size_t wsize = data.size(); + size_t wsize = val_.size(); for (int mask = 1; mask < nprocs_pof2_; mask <<= 1) { auto vdest = vrt_node_ ^ mask; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; @@ -142,137 +153,206 @@ struct Rabenseifner { } } - steps_sent_.resize(num_steps_, false); - steps_recv_.resize(num_steps_, false); + scatter_steps_recv_.resize(num_steps_, false); + } - if constexpr (debug) { - fmt::print( - "[{}] Initialize with size = {} num_steps {} \n", this_node_, w_size_, - num_steps_); + void allreduce() { + if (nprocs_rem_) { + adjustForPowerOfTwo(); + } else { + scatterReduceIter(); } } - void partOne() { + void adjustForPowerOfTwo() { if (is_part_of_adjustment_group_) { auto const partner = is_even_ ? this_node_ + 1 : this_node_ - 1; if (is_even_) { - proxy_[partner].template send<&Rabenseifner::partOneRightHalf>( - DataT{val_.begin() + (val_.size() / 2), val_.end()}); + proxy_[partner] + .template send<&Rabenseifner::adjustForPowerOfTwoRightHalf>( + DataT{val_.begin() + (val_.size() / 2), val_.end()}); } else { - proxy_[partner].template send<&Rabenseifner::partOneLeftHalf>( - DataT{val_.begin(), val_.end() - (val_.size() / 2)}); + proxy_[partner] + .template send<&Rabenseifner::adjustForPowerOfTwoLeftHalf>( + DataT{val_.begin(), val_.end() - (val_.size() / 2)}); } } } - void partOneRightHalf(AllreduceRbnMsg* msg) { + void adjustForPowerOfTwoRightHalf(AllreduceRbnMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { val_[(val_.size() / 2) + i] += msg->val_[i]; } // Send to left node proxy_[theContext()->getNode() - 1] - .template send<&Rabenseifner::partOneFinalPart>( + .template send<&Rabenseifner::adjustForPowerOfTwoFinalPart>( DataT{val_.begin() + (val_.size() / 2), val_.end()}); } - void partOneLeftHalf(AllreduceRbnMsg* msg) { + void adjustForPowerOfTwoLeftHalf(AllreduceRbnMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { val_[i] += msg->val_[i]; } } - void partOneFinalPart(AllreduceRbnMsg* msg) { + void adjustForPowerOfTwoFinalPart(AllreduceRbnMsg* msg) { for (int i = 0; i < msg->val_.size(); i++) { val_[(val_.size() / 2) + i] = msg->val_[i]; } - partTwo(); + finished_adjustment_part_ = true; + + scatterReduceIter(); } - void partTwo() { + void printValues() { + if constexpr (debug) { + std::string printer(1024, 0x0); + for (auto val : val_) { + printer.append(fmt::format("{} ", val)); + } + fmt::print("[{}] Values = {} \n", this_node_, printer); + } + } + + bool scatterAllMessagesReceived() { + return std::all_of( + scatter_steps_recv_.cbegin(), + scatter_steps_recv_.cbegin() + scatter_step_, + [](const auto val) { return val; }); + } + + bool scatterIsDone() { + return scatter_step_ == num_steps_ and scatter_num_recv_ == num_steps_; + } + + bool scatterIsReady() { + return (is_part_of_adjustment_group_ and finished_adjustment_part_) and + scatter_step_ == 0 or + scatterAllMessagesReceived(); + } + + void scatterTryReduce(int32_t step) { if ( - vrt_node_ == -1 or (step_ >= num_steps_) or - (not std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; }))) { + (step < scatter_step_) and not scatter_steps_reduced_[step] and + scatter_steps_recv_[step] and + std::all_of( + scatter_steps_reduced_.cbegin(), scatter_steps_reduced_.cbegin() + step, + [](const auto val) { return val; })) { + auto& in_msg = scatter_messages_.at(step); + auto& in_val = in_msg->val_; + for (int i = 0; i < in_val.size(); i++) { + Op()( + val_[r_index_[in_msg->step_] + i], in_val[i]); + } + + scatter_steps_reduced_[step] = true; + } + } + + void scatterReduceIter() { + if (not scatterIsReady()) { return; } - auto vdest = vrt_node_ ^ mask_; + auto vdest = vrt_node_ ^ scatter_mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; if constexpr (debug) { fmt::print( "[{}] Part2 Step {}: Sending to Node {} starting with idx = {} and " "count " "{} \n", - this_node_, step_, dest, s_index_[step_], s_count_[step_]); + this_node_, scatter_step_, dest, s_index_[scatter_step_], + s_count_[scatter_step_]); } - proxy_[dest].template send<&Rabenseifner::partTwoHandler>( + proxy_[dest].template send<&Rabenseifner::scatterReduceIterHandler>( DataT{ - val_.begin() + (s_index_[step_]), - val_.begin() + (s_index_[step_]) + s_count_[step_]}, - step_); - - mask_ <<= 1; - num_send_++; - steps_sent_[step_] = true; - step_++; - - if (std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; })) { - partTwo(); + val_.begin() + (s_index_[scatter_step_]), + val_.begin() + (s_index_[scatter_step_]) + s_count_[scatter_step_]}, + scatter_step_); + + scatter_mask_ <<= 1; + scatter_step_++; + + scatterTryReduce(scatter_step_ - 1); + + if (scatterIsDone()) { + printValues(); + finished_scatter_part_ = true; + gatherIter(); + } else if (scatterAllMessagesReceived()) { + scatterReduceIter(); } } - void partTwoHandler(AllreduceRbnMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[r_index_[msg->step_] + i] += msg->val_[i]; + void scatterReduceIterHandler(AllreduceRbnMsg* msg) { + scatter_messages_[msg->step_] = promoteMsg(msg); + scatter_steps_recv_[msg->step_] = true; + scatter_num_recv_++; + + if (not finished_adjustment_part_) { + return; } + + scatterTryReduce(msg->step_); + if constexpr (debug) { fmt::print( - "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: " + "[{}] Part2 Step {} scatter_mask_= {} nprocs_pof2_ = {}: " "idx = {} from {}\n", - this_node_, msg->step_, mask_, nprocs_pof2_, r_index_[msg->step_], - theContext()->getFromNodeCurrentTask()); + this_node_, msg->step_, scatter_mask_, nprocs_pof2_, + r_index_[msg->step_], theContext()->getFromNodeCurrentTask()); } - steps_recv_[msg->step_] = true; - num_recv_++; - if (mask_ < nprocs_pof2_) { - if (std::all_of( - steps_recv_.cbegin(), steps_recv_.cbegin() + step_, - [](const auto val) { return val; })) { - partTwo(); - } - } else { - // step_ = num_steps_ - 1; - // mask_ = nprocs_pof2_ >> 1; - // partThree(); + + if ((scatter_mask_ < nprocs_pof2_) and scatterAllMessagesReceived()) { + scatterReduceIter(); + } else if (scatterIsDone()) { + printValues(); + finished_scatter_part_ = true; + gatherIter(); } } - void partThree() { - if ( - vrt_node_ == -1 or - (not std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; }))) { - return; + bool gatherAllMessagesReceived() { + return std::all_of( + gather_steps_recv_.cbegin() + gather_step_ + 1, gather_steps_recv_.cend(), + [](const auto val) { return val; }); + } + + bool gatherIsDone() { + return (gather_step_ < 0) and (gather_num_recv_ == num_steps_); + } + + bool gatherIsReady() { + return (gather_step_ == num_steps_ - 1) or gatherAllMessagesReceived(); + } + + void gatherTryReduce(int32_t step) { + const auto doRed = (step > gather_step_) and + not gather_steps_reduced_[step] and gather_steps_recv_[step] and + std::all_of(gather_steps_reduced_.cbegin() + step + 1, + gather_steps_reduced_.cend(), + [](const auto val) { return val; }); + + if (doRed) { + auto& in_msg = gather_messages_.at(step); + auto& in_val = in_msg->val_; + for (int i = 0; i < in_val.size(); i++) { + val_[s_index_[in_msg->step_] + i] = in_val[i]; + } + + gather_steps_reduced_[step] = true; } + } - if (not startedPartThree_) { - step_ = num_steps_ - 1; - mask_ = nprocs_pof2_ >> 1; - num_send_ = 0; - num_recv_ = 0; - startedPartThree_ = true; - std::fill(steps_sent_.begin(), steps_sent_.end(), false); - std::fill(steps_recv_.begin(), steps_recv_.end(), false); + void gatherIter() { + if (not gatherIsReady()) { + return; } - auto vdest = vrt_node_ ^ mask_; + auto vdest = vrt_node_ ^ gather_mask_; auto dest = (vdest < nprocs_rem_) ? vdest * 2 : vdest + nprocs_rem_; if constexpr (debug) { @@ -280,43 +360,29 @@ struct Rabenseifner { "[{}] Part3 Step {}: Sending to Node {} starting with idx = {} and " "count " "{} \n", - this_node_, step_, dest, r_index_[step_], r_count_[step_]); + this_node_, gather_step_, dest, r_index_[gather_step_], + r_count_[gather_step_]); } - proxy_[dest].template send<&Rabenseifner::partThreeHandler>( + proxy_[dest].template send<&Rabenseifner::gatherIterHandler>( DataT{ - val_.begin() + (r_index_[step_]), - val_.begin() + (r_index_[step_]) + r_count_[step_]}, - step_); - - steps_sent_[step_] = true; - num_send_++; - mask_ >>= 1; - step_--; - if ( - step_ >= 0 and - std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; })) { - partThree(); - } - } + val_.begin() + (r_index_[gather_step_]), + val_.begin() + (r_index_[gather_step_]) + r_count_[gather_step_]}, + gather_step_); - void partThreeHandler(AllreduceRbnMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { - val_[s_index_[msg->step_] + i] = msg->val_[i]; - } + gather_mask_ >>= 1; + gather_step_--; + + gatherTryReduce(gather_step_ + 1); + printValues(); - if (not startedPartThree_) { - step_ = num_steps_ - 1; - mask_ = nprocs_pof2_ >> 1; - num_send_ = 0; - num_recv_ = 0; - startedPartThree_ = true; - std::fill(steps_sent_.begin(), steps_sent_.end(), false); - std::fill(steps_recv_.begin(), steps_recv_.end(), false); + if (gatherIsDone()) { + finalPart(); + } else if (gatherIsReady()) { + gatherIter(); } + } - num_recv_++; + void gatherIterHandler(AllreduceRbnMsg* msg) { if constexpr (debug) { fmt::print( "[{}] Part3 Step {}: Received idx = {} from {}\n", this_node_, @@ -324,55 +390,93 @@ struct Rabenseifner { theContext()->getFromNodeCurrentTask()); } - steps_recv_[msg->step_] = true; + gather_messages_[msg->step_] = promoteMsg(msg); + gather_steps_recv_[msg->step_] = true; + gather_num_recv_++; - if ( - mask_ > 0 and - ((step_ == num_steps_ - 1) or - std::all_of( - steps_recv_.cbegin() + step_ + 1, steps_recv_.cend(), - [](const auto val) { return val; }))) { - partThree(); + if (not finished_scatter_part_) { + return; + } + + gatherTryReduce(msg->step_); + printValues(); + + if (gather_mask_ > 0 and gatherIsReady()) { + gatherIter(); + } else if (gatherIsDone()) { + finalPart(); } } - void partFour() { + void finalPart() { + if (completed_) { + return; + } + + if (nprocs_rem_) { + sendToExcludedNodes(); + } + + parent_proxy_[this_node_].template invoke(val_); + completed_ = true; + } + + void sendToExcludedNodes() { if (is_part_of_adjustment_group_ and is_even_) { if constexpr (debug) { fmt::print( "[{}] Part4 : Sending to Node {} \n", this_node_, this_node_ + 1); } - proxy_[this_node_ + 1].template send<&Rabenseifner::partFourHandler>( - val_, 0); + proxy_[this_node_ + 1] + .template send<&Rabenseifner::sendToExcludedNodesHandler>(val_, 0); } } - void partFourHandler(AllreduceRbnMsg* msg) { val_ = msg->val_; } + void sendToExcludedNodesHandler(AllreduceRbnMsg* msg) { + val_ = msg->val_; + + parent_proxy_[this_node_].template invoke(val_); + completed_ = true; + } - NodeType this_node_ = {}; - bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; - vt::objgroup::proxy::Proxy parentProxy_ = {}; + vt::objgroup::proxy::Proxy parent_proxy_ = {}; + DataT val_ = {}; - NodeType vrt_node_ = {}; - bool is_part_of_adjustment_group_ = false; + NodeType this_node_ = {}; + NodeType num_nodes_ = {}; + bool is_even_ = false; int32_t num_steps_ = {}; int32_t nprocs_pof2_ = {}; int32_t nprocs_rem_ = {}; - int32_t mask_ = 1; - bool startedPartThree_ = false; - - size_t w_size_ = {}; - int32_t step_ = 0; - int32_t num_send_ = 0; - int32_t num_recv_ = 0; - std::vector steps_recv_ = {}; - std::vector steps_sent_ = {}; std::vector r_index_ = {}; std::vector r_count_ = {}; std::vector s_index_ = {}; std::vector s_count_ = {}; + + NodeType vrt_node_ = {}; + bool is_part_of_adjustment_group_ = false; + bool finished_adjustment_part_ = false; + + bool completed_ = false; + + // Scatter + int32_t scatter_mask_ = 1; + int32_t scatter_step_ = 0; + int32_t scatter_num_recv_ = 0; + std::vector scatter_steps_recv_ = {}; + std::vector scatter_steps_reduced_ = {}; + std::vector>> scatter_messages_ = {}; + bool finished_scatter_part_ = false; + + // Gather + int32_t gather_mask_ = 1; + int32_t gather_step_ = 0; + int32_t gather_num_recv_ = 0; + std::vector gather_steps_recv_ = {}; + std::vector gather_steps_reduced_ = {}; + std::vector>> gather_messages_ = {}; }; } // namespace vt::collective::reduce::allreduce diff --git a/src/vt/collective/reduce/allreduce/recursive_doubling.h b/src/vt/collective/reduce/allreduce/recursive_doubling.h index e1ac7873d6..917196a4d3 100644 --- a/src/vt/collective/reduce/allreduce/recursive_doubling.h +++ b/src/vt/collective/reduce/allreduce/recursive_doubling.h @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// reduce.h +// recursive_doubling.h // DARMA/vt => Virtual Transport // // Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC @@ -41,8 +41,8 @@ //@HEADER */ -#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DISTANCE_DOUBLING_H -#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DISTANCE_DOUBLING_H +#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RECURSIVE_DOUBLING_H +#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RECURSIVE_DOUBLING_H #include "vt/config.h" #include "vt/context/context.h" @@ -68,10 +68,11 @@ struct AllreduceDblMsg AllreduceDblMsg(AllreduceDblMsg const&) = default; AllreduceDblMsg(AllreduceDblMsg&&) = default; - explicit AllreduceDblMsg(DataT&& in_val) + AllreduceDblMsg(DataT&& in_val, int step = 0) : MessageParentType(), - val_(std::forward(in_val)) { } - explicit AllreduceDblMsg(DataT const& in_val, int step = 0) + val_(std::forward(in_val)), + step_(step) { } + AllreduceDblMsg(DataT const& in_val, int step = 0) : MessageParentType(), val_(in_val), step_(step) { } @@ -92,18 +93,18 @@ template < auto finalHandler> struct DistanceDoubling { template - DistanceDoubling(NodeType num_nodes, Args&&... args) - : val_(std::forward(args)...), - num_nodes_(num_nodes) { } - - void initialize() { - this_node_ = vt::theContext()->getNode(); - is_even_ = this_node_ % 2 == 0; - num_steps_ = static_cast(log2(num_nodes_)); - messages.resize(num_steps_, nullptr); - - nprocs_pof2_ = 1 << num_steps_; - nprocs_rem_ = num_nodes_ - nprocs_pof2_; + DistanceDoubling( + vt::objgroup::proxy::Proxy parentProxy, NodeType num_nodes, + Args&&... args) + : parent_proxy_(parentProxy), + val_(std::forward(args)...), + num_nodes_(num_nodes), + this_node_(vt::theContext()->getNode()), + is_even_(this_node_ % 2 == 0), + num_steps_(static_cast(log2(num_nodes_))), + nprocs_pof2_(1 << num_steps_), + nprocs_rem_(num_nodes_ - nprocs_pof2_), + finished_adjustment_part_(nprocs_rem_ == 0) { is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); if (is_part_of_adjustment_group_) { if (is_even_) { @@ -115,22 +116,12 @@ struct DistanceDoubling { vrt_node_ = this_node_ - nprocs_rem_; } + messages_.resize(num_steps_, nullptr); steps_recv_.resize(num_steps_, false); steps_reduced_.resize(num_steps_, false); - - initialized_ = true; } - void allreduce( - vt::objgroup::proxy::Proxy proxy, - vt::objgroup::proxy::Proxy parentProxy) { - if (not initialized_) { - initialize(); - } - - proxy_ = proxy; - parent_proxy_ = parentProxy; - + void allreduce() { if (nprocs_rem_) { adjustForPowerOfTwo(); } else { @@ -157,9 +148,9 @@ struct DistanceDoubling { data.append(fmt::format("{} ", val)); } fmt::print( - "[{}] Part1 Handler initialized_ = {}: Received data ({}) " + "[{}] Part1 Handler: Received data ({}) " "from {}\n", - this_node_, initialized_, data, theContext()->getFromNodeCurrentTask()); + this_node_, data, theContext()->getFromNodeCurrentTask()); } Op()(val_, msg->val_); @@ -216,7 +207,7 @@ struct DistanceDoubling { std::all_of( steps_reduced_.cbegin(), steps_reduced_.cbegin() + step, [](const auto val) { return val; })) { - Op()(val_, messages.at(step)->val_); + Op()(val_, messages_.at(step)->val_); steps_reduced_[step] = true; } } @@ -228,28 +219,21 @@ struct DistanceDoubling { data.append(fmt::format("{} ", val)); } fmt::print( - "[{}] Part2 Step {} initialized_ = {} mask_= {} nprocs_pof2_ = {}: " + "[{}] Part2 Step {} mask_= {} nprocs_pof2_ = {}: " "Received data ({}) " "from {}\n", - this_node_, msg->step_, initialized_, mask_, nprocs_pof2_, data, + this_node_, msg->step_, mask_, nprocs_pof2_, data, theContext()->getFromNodeCurrentTask()); } + messages_.at(msg->step_) = promoteMsg(msg); + steps_recv_[msg->step_] = true; + // Special case when we receive step 2 message before step 1 is done on this node if (not finished_adjustment_part_) { - if (not initialized_) { - initialize(); - } - - messages.at(msg->step_) = promoteMsg(msg); - steps_recv_[msg->step_] = true; - return; } - messages.at(msg->step_) = promoteMsg(msg); - steps_recv_[msg->step_] = true; - tryReduce(msg->step_); if ((mask_ < nprocs_pof2_) and isReady()) { @@ -275,6 +259,7 @@ struct DistanceDoubling { val_ = msg->val_; parent_proxy_[this_node_].template invoke(val_); + completed_ = true; } void finalPart() { @@ -290,19 +275,21 @@ struct DistanceDoubling { completed_ = true; } - NodeType this_node_ = {}; - uint32_t num_nodes_ = {}; - bool is_even_ = false; vt::objgroup::proxy::Proxy proxy_ = {}; vt::objgroup::proxy::Proxy parent_proxy_ = {}; + DataT val_ = {}; - NodeType vrt_node_ = {}; - bool initialized_ = false; - bool is_part_of_adjustment_group_ = false; - bool finished_adjustment_part_ = false; + NodeType this_node_ = {}; + NodeType num_nodes_ = {}; + bool is_even_ = false; int32_t num_steps_ = {}; int32_t nprocs_pof2_ = {}; int32_t nprocs_rem_ = {}; + + NodeType vrt_node_ = {}; + bool is_part_of_adjustment_group_ = false; + bool finished_adjustment_part_ = false; + int32_t mask_ = 1; int32_t step_ = 0; @@ -311,9 +298,9 @@ struct DistanceDoubling { std::vector steps_recv_ = {}; std::vector steps_reduced_ = {}; - std::vector>> messages = {}; + std::vector>> messages_ = {}; }; } // namespace vt::collective::reduce::allreduce -#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H*/ +#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RECURSIVE_DOUBLING_H*/ diff --git a/src/vt/objgroup/manager.h b/src/vt/objgroup/manager.h index 5c29647dc0..3ea00d7191 100644 --- a/src/vt/objgroup/manager.h +++ b/src/vt/objgroup/manager.h @@ -292,7 +292,7 @@ struct ObjGroupManager : runtime::component::Component { ); template class Op, typename DataT> -ObjGroupManager::PendingSendType allreduce_r(ProxyType proxy, const DataT& data); +ObjGroupManager::PendingSendType allreduce(ProxyType proxy, const DataT& data); /** * \brief Perform a reduction over an objgroup diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 7ea2e8c87b..4fb3d3dac9 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -57,7 +57,6 @@ #include "vt/collective/collective_alg.h" #include "vt/messaging/active.h" #include "vt/elm/elm_id_bits.h" -#include "vt/collective/reduce/allreduce/rabenseifner.h" #include "vt/messaging/message/smart_ptr.h" #include @@ -268,7 +267,7 @@ ObjGroupManager::PendingSendType ObjGroupManager::broadcast(MsgSharedPtr m template < auto f, typename ObjT, template class Op, typename DataT> ObjGroupManager::PendingSendType -ObjGroupManager::allreduce_r(ProxyType proxy, const DataT& data) { +ObjGroupManager::allreduce(ProxyType proxy, const DataT& data) { // check payload size and choose appropriate algorithm auto const this_node = vt::theContext()->getNode(); diff --git a/src/vt/objgroup/proxy/proxy_objgroup.impl.h b/src/vt/objgroup/proxy/proxy_objgroup.impl.h index f9c4ba5b06..72572fbc7e 100644 --- a/src/vt/objgroup/proxy/proxy_objgroup.impl.h +++ b/src/vt/objgroup/proxy/proxy_objgroup.impl.h @@ -214,7 +214,7 @@ Proxy::allreduce_h( Args&&... args ) const { auto proxy = Proxy(*this); - return theObjGroup()->allreduce_r< + return theObjGroup()->allreduce< f, ObjT, Op diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc index 645f83136d..00f31773a1 100644 --- a/tests/perf/allreduce.cc +++ b/tests/perf/allreduce.cc @@ -2,7 +2,7 @@ //@HEADER // ***************************************************************************** // -// reduce.cc +// allreduce.cc // DARMA/vt => Virtual Transport // // Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC @@ -61,7 +61,7 @@ static constexpr int num_iters = 1; struct MyTest : PerfTestHarness { void SetUp() override { PerfTestHarness::SetUp(); - data.resize(1 << 4); + data.resize(1 << 16); for (auto& val : data) { val = theContext()->getNode() + 1; } @@ -105,17 +105,12 @@ struct NodeObj { } void newReduceComplete(std::vector in) { - // fmt::print( - // "\n[{}]: allreduce_h done! (Size == {}) Results are ...\n", - // theContext()->getNode(), in.size()); - // const auto p = theContext()->getNumNodes(); - // const auto expected = (p * (p + 1)) / 2; - // for (auto val : in) { - // vtAssert(val == expected, "FAILURE!"); - // } + // std::string printer(1024, 0x0); + // printer.append(fmt::format("\n[{}]: allreduce_rabenseifner done! ", theContext()->getNode())); + // for (int node = 0; node < theContext()->getNumNodes(); ++node) { // if (node == theContext()->getNode()) { - // std::string printer(128, 0x0); + // for (auto val : in) { // printer.append(fmt::format("{} ", val)); // } @@ -127,6 +122,11 @@ struct NodeObj { // } // fmt::print("\n"); + // const auto p = theContext()->getNumNodes(); + // const auto expected = (p * (p + 1)) / 2; + // for (auto val : in) { + // vtAssert(val == expected, "FAILURE!"); + // } } void reduceComplete(std::vector in) { @@ -161,13 +161,11 @@ VT_PERF_TEST(MyTest, test_allreduce_rabenseifner) { using Reducer = collective::reduce::allreduce::Rabenseifner< DataT, collective::PlusOp, NodeObj, &NodeObj::newReduceComplete>; - auto grp_proxy = - vt::theObjGroup()->makeCollective("allreduce_rabenseifner"); - vt::runInEpochCollective([=] { - grp_proxy[my_node_].template invoke<&Reducer::initialize>( - data, grp_proxy, proxy, num_nodes_); - grp_proxy[my_node_].template invoke<&Reducer::partOne>(); - }); + auto grp_proxy = vt::theObjGroup()->makeCollective( + "allreduce_rabenseifner", proxy, num_nodes_, data); + grp_proxy[my_node_].get()->proxy_ = grp_proxy; + vt::runInEpochCollective( + [=] { grp_proxy[my_node_].template invoke<&Reducer::allreduce>(); }); } VT_PERF_TEST(MyTest, test_allreduce_recursive_doubling) { @@ -179,10 +177,10 @@ VT_PERF_TEST(MyTest, test_allreduce_recursive_doubling) { DataT, collective::PlusOp, NodeObj, &NodeObj::recursiveDoubling>; auto grp_proxy = vt::theObjGroup()->makeCollective( - "allreduce_recursive_doubling", num_nodes_, data); - vt::runInEpochCollective([=] { - grp_proxy[my_node_].template invoke<&Reducer::allreduce>(grp_proxy, proxy); - }); + "allreduce_recursive_doubling", proxy, num_nodes_, data); + grp_proxy[my_node_].get()->proxy_ = grp_proxy; + vt::runInEpochCollective( + [=] { grp_proxy[my_node_].template invoke<&Reducer::allreduce>(); }); } VT_PERF_TEST_MAIN() From 166f2319f7050a39a8622d9d7cbe5ef345baae1b Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Fri, 26 Apr 2024 11:53:32 +0200 Subject: [PATCH 09/29] #2240: Improve accuracy of timing allreduce algorithms in allreduce.cc --- tests/perf/allreduce.cc | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/perf/allreduce.cc b/tests/perf/allreduce.cc index 00f31773a1..eba84358eb 100644 --- a/tests/perf/allreduce.cc +++ b/tests/perf/allreduce.cc @@ -45,6 +45,7 @@ #include "vt/collective/reduce/operators/functors/plus_op.h" #include "vt/configs/error/config_assert.h" #include "vt/context/context.h" +#include "vt/scheduler/scheduler.h" #include #include #include @@ -57,7 +58,6 @@ using namespace vt; using namespace vt::tests::perf::common; -static constexpr int num_iters = 1; struct MyTest : PerfTestHarness { void SetUp() override { PerfTestHarness::SetUp(); @@ -71,7 +71,7 @@ struct MyTest : PerfTestHarness { }; struct NodeObj { - explicit NodeObj(MyTest* test_obj) : test_obj_(test_obj) { } + explicit NodeObj(MyTest* test_obj, const std::string& name) : test_obj_(test_obj), timer_name_(name) { } void initialize() { proxy_ = vt::theObjGroup()->getProxy(this); @@ -102,6 +102,7 @@ struct NodeObj { // for (auto val : in) { // vtAssert(val == expected, "FAILURE!"); // } + test_obj_->StopTimer(timer_name_); } void newReduceComplete(std::vector in) { @@ -127,6 +128,7 @@ struct NodeObj { // for (auto val : in) { // vtAssert(val == expected, "FAILURE!"); // } + test_obj_->StopTimer(timer_name_); } void reduceComplete(std::vector in) { @@ -137,25 +139,26 @@ struct NodeObj { // } // fmt::print("\n"); + test_obj_->StopTimer(timer_name_); } -private: + std::string timer_name_ = {}; MyTest* test_obj_ = nullptr; vt::objgroup::proxy::Proxy proxy_ = {}; }; VT_PERF_TEST(MyTest, test_reduce) { auto grp_proxy = - vt::theObjGroup()->makeCollective("test_allreduce", this); + vt::theObjGroup()->makeCollective("test_allreduce", this, "Reduce -> Bcast"); - vt::runInEpochCollective([=] { - grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); - }); + theCollective()->barrier(); + StartTimer(grp_proxy[theContext()->getNode()].get()->timer_name_); + grp_proxy.allreduce<&NodeObj::reduceComplete, collective::PlusOp>(data); } VT_PERF_TEST(MyTest, test_allreduce_rabenseifner) { auto proxy = - vt::theObjGroup()->makeCollective("test_allreduce_new", this); + vt::theObjGroup()->makeCollective("test_allreduce_new", this, "Rabenseifner"); using DataT = decltype(data); using Reducer = collective::reduce::allreduce::Rabenseifner< @@ -164,13 +167,15 @@ VT_PERF_TEST(MyTest, test_allreduce_rabenseifner) { auto grp_proxy = vt::theObjGroup()->makeCollective( "allreduce_rabenseifner", proxy, num_nodes_, data); grp_proxy[my_node_].get()->proxy_ = grp_proxy; - vt::runInEpochCollective( - [=] { grp_proxy[my_node_].template invoke<&Reducer::allreduce>(); }); + + theCollective()->barrier(); + StartTimer(proxy[theContext()->getNode()].get()->timer_name_); + grp_proxy[my_node_].template invoke<&Reducer::allreduce>(); } VT_PERF_TEST(MyTest, test_allreduce_recursive_doubling) { auto proxy = - vt::theObjGroup()->makeCollective("test_allreduce_new_2", this); + vt::theObjGroup()->makeCollective("test_allreduce_new_2", this, "Recursive doubling"); using DataT = decltype(data); using Reducer = collective::reduce::allreduce::DistanceDoubling< From fa16fa14df5a85e386d82bd8212b53d5ccfecba8 Mon Sep 17 00:00:00 2001 From: Jacob Domagala Date: Tue, 21 May 2024 18:02:03 +0200 Subject: [PATCH 10/29] #2240: Add unit tests for new allreduce and cleanup code --- .../reduce/allreduce/rabenseifner.h | 37 ++++-- .../reduce/allreduce/recursive_doubling.h | 14 ++- src/vt/objgroup/manager.h | 6 + src/vt/objgroup/manager.impl.h | 105 ++++++++++-------- tests/perf/allreduce.cc | 6 +- tests/unit/objgroup/test_objgroup.cc | 43 +++++++ tests/unit/objgroup/test_objgroup_common.h | 28 +++++ 7 files changed, 178 insertions(+), 61 deletions(-) diff --git a/src/vt/collective/reduce/allreduce/rabenseifner.h b/src/vt/collective/reduce/allreduce/rabenseifner.h index fa5ea0a557..c47f5ff0f0 100644 --- a/src/vt/collective/reduce/allreduce/rabenseifner.h +++ b/src/vt/collective/reduce/allreduce/rabenseifner.h @@ -41,6 +41,7 @@ //@HEADER */ + #if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H #define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_RABENSEIFNER_H @@ -48,6 +49,8 @@ #include "vt/context/context.h" #include "vt/messaging/message/message.h" #include "vt/objgroup/proxy/proxy_objgroup.h" +#include "vt/registry/auto/auto_registry.h" +#include "vt/pipe/pipe_manager.h" #include #include @@ -95,7 +98,6 @@ struct Rabenseifner { vt::objgroup::proxy::Proxy parentProxy, NodeType num_nodes, Args&&... args) : parent_proxy_(parentProxy), - val_(std::forward(args)...), num_nodes_(num_nodes), this_node_(vt::theContext()->getNode()), is_even_(this_node_ % 2 == 0), @@ -104,7 +106,15 @@ struct Rabenseifner { nprocs_rem_(num_nodes_ - nprocs_pof2_), gather_step_(num_steps_ - 1), gather_mask_(nprocs_pof2_ >> 1), - finished_adjustment_part_(nprocs_rem_ == 0) { + finished_adjustment_part_(nprocs_rem_ == 0) + { + initialize(std::forward(args)...); + } + + template + void initialize(Args&&... args) { + val_ = DataT(std::forward(args)...); + is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); if (is_part_of_adjustment_group_) { if (is_even_) { @@ -156,6 +166,13 @@ struct Rabenseifner { scatter_steps_recv_.resize(num_steps_, false); } + void executeFinalHan() { + + // theCB()->makeSend(parent_proxy_[this_node_]).sendTuple(std::make_tuple(val_)); + parent_proxy_[this_node_].template invoke(val_); + completed_ = true; + } + void allreduce() { if (nprocs_rem_) { adjustForPowerOfTwo(); @@ -181,7 +198,7 @@ struct Rabenseifner { } void adjustForPowerOfTwoRightHalf(AllreduceRbnMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { + for (uint32_t i = 0; i < msg->val_.size(); i++) { val_[(val_.size() / 2) + i] += msg->val_[i]; } @@ -192,13 +209,13 @@ struct Rabenseifner { } void adjustForPowerOfTwoLeftHalf(AllreduceRbnMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { + for (uint32_t i = 0; i < msg->val_.size(); i++) { val_[i] += msg->val_[i]; } } void adjustForPowerOfTwoFinalPart(AllreduceRbnMsg* msg) { - for (int i = 0; i < msg->val_.size(); i++) { + for (uint32_t i = 0; i < msg->val_.size(); i++) { val_[(val_.size() / 2) + i] = msg->val_[i]; } @@ -243,7 +260,7 @@ struct Rabenseifner { [](const auto val) { return val; })) { auto& in_msg = scatter_messages_.at(step); auto& in_val = in_msg->val_; - for (int i = 0; i < in_val.size(); i++) { + for (uint32_t i = 0; i < in_val.size(); i++) { Op()( val_[r_index_[in_msg->step_] + i], in_val[i]); } @@ -339,7 +356,7 @@ struct Rabenseifner { if (doRed) { auto& in_msg = gather_messages_.at(step); auto& in_val = in_msg->val_; - for (int i = 0; i < in_val.size(); i++) { + for (uint32_t i = 0; i < in_val.size(); i++) { val_[s_index_[in_msg->step_] + i] = in_val[i]; } @@ -417,8 +434,7 @@ struct Rabenseifner { sendToExcludedNodes(); } - parent_proxy_[this_node_].template invoke(val_); - completed_ = true; + executeFinalHan(); } void sendToExcludedNodes() { @@ -435,8 +451,7 @@ struct Rabenseifner { void sendToExcludedNodesHandler(AllreduceRbnMsg* msg) { val_ = msg->val_; - parent_proxy_[this_node_].template invoke(val_); - completed_ = true; + executeFinalHan(); } vt::objgroup::proxy::Proxy proxy_ = {}; diff --git a/src/vt/collective/reduce/allreduce/recursive_doubling.h b/src/vt/collective/reduce/allreduce/recursive_doubling.h index 917196a4d3..f283d8499f 100644 --- a/src/vt/collective/reduce/allreduce/recursive_doubling.h +++ b/src/vt/collective/reduce/allreduce/recursive_doubling.h @@ -97,7 +97,6 @@ struct DistanceDoubling { vt::objgroup::proxy::Proxy parentProxy, NodeType num_nodes, Args&&... args) : parent_proxy_(parentProxy), - val_(std::forward(args)...), num_nodes_(num_nodes), this_node_(vt::theContext()->getNode()), is_even_(this_node_ % 2 == 0), @@ -105,6 +104,12 @@ struct DistanceDoubling { nprocs_pof2_(1 << num_steps_), nprocs_rem_(num_nodes_ - nprocs_pof2_), finished_adjustment_part_(nprocs_rem_ == 0) { + initialize(std::forward(args)...); + } + + template + void initialize(Args&&... args) { + val_ = DataT(std::forward(args)...); is_part_of_adjustment_group_ = this_node_ < (2 * nprocs_rem_); if (is_part_of_adjustment_group_) { if (is_even_) { @@ -168,8 +173,8 @@ struct DistanceDoubling { [](const auto val) { return val; }); } bool isReady() { - return (is_part_of_adjustment_group_ and finished_adjustment_part_) and - step_ == 0 or + return ((is_part_of_adjustment_group_ and finished_adjustment_part_) and + step_ == 0) or allMessagesReceived(); } @@ -279,8 +284,9 @@ struct DistanceDoubling { vt::objgroup::proxy::Proxy parent_proxy_ = {}; DataT val_ = {}; - NodeType this_node_ = {}; NodeType num_nodes_ = {}; + NodeType this_node_ = {}; + bool is_even_ = false; int32_t num_steps_ = {}; int32_t nprocs_pof2_ = {}; diff --git a/src/vt/objgroup/manager.h b/src/vt/objgroup/manager.h index 3ea00d7191..e4921dfef9 100644 --- a/src/vt/objgroup/manager.h +++ b/src/vt/objgroup/manager.h @@ -41,6 +41,7 @@ //@HEADER */ +#include "vt/configs/types/types_type.h" #if !defined INCLUDED_VT_OBJGROUP_MANAGER_H #define INCLUDED_VT_OBJGROUP_MANAGER_H @@ -291,6 +292,9 @@ struct ObjGroupManager : runtime::component::Component { ProxyType proxy, std::string const& name, std::string const& parent = "" ); +template class Op, typename DataT> +ObjGroupManager::PendingSendType allreduce(ProxyType proxy, const DataT& data); + template class Op, typename DataT> ObjGroupManager::PendingSendType allreduce(ProxyType proxy, const DataT& data); @@ -504,6 +508,8 @@ ObjGroupManager::PendingSendType allreduce(ProxyType proxy, const DataT& d std::unordered_map> pending_; /// Map of object groups' labels std::unordered_map labels_; + + std::unordered_map reducers_; }; }} /* end namespace vt::objgroup */ diff --git a/src/vt/objgroup/manager.impl.h b/src/vt/objgroup/manager.impl.h index 4fb3d3dac9..72e464335e 100644 --- a/src/vt/objgroup/manager.impl.h +++ b/src/vt/objgroup/manager.impl.h @@ -41,6 +41,7 @@ //@HEADER */ +#include "vt/configs/types/types_sentinels.h" #if !defined INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H #define INCLUDED_VT_OBJGROUP_MANAGER_IMPL_H @@ -58,7 +59,10 @@ #include "vt/messaging/active.h" #include "vt/elm/elm_id_bits.h" #include "vt/messaging/message/smart_ptr.h" +#include "vt/collective/reduce/allreduce/rabenseifner.h" +#include "vt/collective/reduce/allreduce/recursive_doubling.h" #include +#include #include @@ -264,57 +268,70 @@ ObjGroupManager::PendingSendType ObjGroupManager::broadcast(MsgSharedPtr m return objgroup::broadcast(msg,han); } + +// Helper trait to detect if a type is a specialization of a given variadic template +template