Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#2240: Add Rabenseifner and Recursive doubling allreduce algorithms for ObjGroup #2272

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7def560
#2240: Initial work for new allreduce
JacobDomagala Mar 24, 2024
01b6afb
#2240: Semi working Rabenseifner
JacobDomagala Mar 27, 2024
5289362
#2240: Working Rabenseifner (non-commutative ops)
JacobDomagala Apr 4, 2024
d52afeb
#2240: Fix non power of 2 for new allreduce
JacobDomagala Apr 7, 2024
5372da5
#2240: Initial work for adding recursive doubling allreduce algorithm
JacobDomagala Apr 10, 2024
90a20e0
#2240: Make sure the order of reduce operations is correct
JacobDomagala Apr 11, 2024
8bf1cc9
#2240: Working Recursive doubling
JacobDomagala Apr 15, 2024
bb1ca10
#2240: Code cleanup and make Rabenseifner work with any Op type
JacobDomagala Apr 16, 2024
166f231
#2240: Improve accuracy of timing allreduce algorithms in allreduce.cc
JacobDomagala Apr 26, 2024
fa16fa1
#2240: Add unit tests for new allreduce and cleanup code
JacobDomagala May 21, 2024
a0fdad8
#2240: DataHandler for Rabenseifner allreduce that provides common AP…
JacobDomagala May 28, 2024
f9a60fa
#2240: Fix warnings
JacobDomagala May 28, 2024
63b39f5
#2240: Update ObjGroup test to use custom DataHandler for Rabenseifne…
JacobDomagala May 30, 2024
a07f6c8
#2240: Add unit test for Rabenseifner with Kokkos::View as DataType a…
JacobDomagala May 31, 2024
316bfb8
#2240: Move function definitions to impl.h file for Rabenseifner
JacobDomagala Jun 3, 2024
b8cd612
#2240: Add allreduce print category and use it in rabenseifner instea…
JacobDomagala Jun 4, 2024
5f40e4b
#2240: Provide documentation for RecursiveDoubling algorithm
JacobDomagala Jun 4, 2024
b200ecd
#2240: Use vt_debug_print for RecursiveDoubling allreduce
JacobDomagala Jun 4, 2024
eb1bc40
#2240: Update allreduce perf tests to use array of payload sizes
JacobDomagala Jun 5, 2024
977e9e3
#2240: Fix runtime failure in allreduce perf test
JacobDomagala Jun 7, 2024
1456dd5
#2240: Working allreduce perf test with Kokkos
JacobDomagala Jun 16, 2024
5803848
#2240: Working RecursiveDoubling with multiple allreduce in flight
JacobDomagala Jun 17, 2024
2015e78
#2240: Update Rabenseifner to use ID for each allreduce and update tests
JacobDomagala Jun 18, 2024
87ad4cf
#2240: Fix failing unit and performance tests for multiple allreduce …
JacobDomagala Jun 25, 2024
ab0357b
#2240: Fix compile issues on some compilers and runtime issue with pa…
JacobDomagala Jul 2, 2024
be3ee2c
#2240: Update logs
JacobDomagala Jul 6, 2024
28139a7
#2240: Fix issues with handlers being executed and payload not being …
JacobDomagala Jul 16, 2024
c5232dc
#2240: Add helpers and use Kokkos::View for internals of Rabenseifner…
JacobDomagala Jul 17, 2024
57b8cab
#2240: Store Reducers by tuple(ProxyType, DataType, OperandType)
JacobDomagala Jul 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions src/vt/collective/reduce/allreduce/data_handler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
//@HEADER
// *****************************************************************************
//
// data_handler.h
// DARMA/vt => Virtual Transport
//
// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC
// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S.
// Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact darma@sandia.gov
//
// *****************************************************************************
//@HEADER
*/

#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DATA_HANDLER_H
#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DATA_HANDLER_H

#include <vector>
JacobDomagala marked this conversation as resolved.
Show resolved Hide resolved

#ifdef MAGISTRATE_KOKKOS_ENABLED
#include <Kokkos_Core.hpp>
#endif // MAGISTRATE_KOKKOS_ENABLED

namespace vt::collective::reduce::allreduce {

/**
 * \brief Fallback \c DataHandler used when no specialization matches
 * \c DataType.
 *
 * Exposes \c void as the scalar type and a \c size() that always reports zero
 * elements; it provides no conversion functions.
 *
 * \tparam DataType payload type being described
 * \tparam Enable   SFINAE hook for partial specializations (defaults to void)
 */
template <typename DataType, typename Enable = void>
class DataHandler {
public:
  using Scalar = void;

  /// \return always 0
  static size_t size() {
    return 0;
  }
};

/**
 * \brief \c DataHandler specialization for a single arithmetic value.
 *
 * A scalar is treated as a one-element payload.
 *
 * \tparam ScalarType any type for which \c std::is_arithmetic is true
 */
template <typename ScalarType>
class DataHandler<
  ScalarType, std::enable_if_t<std::is_arithmetic_v<ScalarType>>
> {
public:
  using Scalar = ScalarType;

  /// \return a vector holding exactly one element, a copy of \c data
  static std::vector<ScalarType> toVec(const ScalarType& data) {
    return {data};
  }

  /// \return the first element of \c data (no emptiness check is performed)
  static ScalarType fromVec(const std::vector<ScalarType>& data) {
    return data.front();
  }

  /// \return the value \c data points to; the element count is ignored
  static ScalarType fromMemory(const ScalarType* data, size_t) {
    return data[0];
  }

  /// \return always 1
  static size_t size(const ScalarType&) {
    return 1;
  }
};

/**
 * \brief \c DataHandler specialization for \c std::vector<T> payloads.
 *
 * \tparam T element type of the vector
 */
template <typename T>
class DataHandler<std::vector<T>> {
public:
  using Scalar = T;

  /// \return a reference to \c data itself; nothing is copied
  static const std::vector<T>& toVec(const std::vector<T>& data) {
    return data;
  }

  /// \return a copy of \c data
  static std::vector<T> fromVec(const std::vector<T>& data) {
    std::vector<T> duplicate(data);
    return duplicate;
  }

  /// \return a vector holding copies of the \c count elements starting at \c data
  static std::vector<T> fromMemory(const T* data, size_t count) {
    const T* const last = data + count;
    return std::vector<T>(data, last);
  }

  /// \return number of elements in \c data
  static size_t size(const std::vector<T>& data) {
    return data.size();
  }
};

#if MAGISTRATE_KOKKOS_ENABLED

/**
 * \brief \c DataHandler specialization for rank-1 \c Kokkos::View payloads
 * living in \c Kokkos::HostSpace.
 *
 * \tparam T     element type of the view
 * \tparam Props remaining view properties, forwarded unchanged to the view type
 */
template <typename T, typename... Props>
class DataHandler<Kokkos::View<T*, Kokkos::HostSpace, Props...>> {
  using ViewType = Kokkos::View<T*, Kokkos::HostSpace, Props...>;

public:
  using Scalar = T;

  /**
   * \brief Copy the view's elements into a new \c std::vector.
   *
   * Reads \c extent(0) consecutive elements starting at \c data.data(), i.e.
   * the view's storage is treated as contiguous. The range constructor is used
   * instead of a raw byte copy so element copies stay well-defined for any
   * copyable \c T and an empty view yields an empty vector.
   *
   * \param[in] data source view
   * \return vector with \c data.extent(0) elements
   */
  static std::vector<T> toVec(const ViewType& data) {
    const T* const first = data.data();
    return std::vector<T>(first, first + data.extent(0));
  }

  /**
   * \brief Build a view of \c size elements directly from a raw pointer.
   *
   * NOTE(review): Kokkos' pointer constructor produces a view that wraps the
   * caller's memory rather than copying it -- confirm that callers keep
   * \c data alive for as long as the returned view is used.
   *
   * \param[in] data pointer to the first element
   * \param[in] size number of elements
   */
  static ViewType fromMemory(T* data, size_t size) {
    return ViewType(data, size);
  }

  /**
   * \brief Allocate a view of \c data.size() elements and fill it from \c data.
   *
   * Elements are assigned as \c T without any intermediate conversion, so
   * values of wider types (e.g. \c double or 64-bit integers) are preserved
   * exactly.
   *
   * \param[in] data source elements
   * \return newly allocated view holding a copy of \c data
   */
  static ViewType fromVec(const std::vector<T>& data) {
    ViewType view("", data.size());
    Kokkos::parallel_for(
      "InitView", view.extent(0),
      KOKKOS_LAMBDA(const int i) { view(i) = data[i]; });

    return view;
  }

  /// \return number of elements along the view's only dimension
  static size_t size(const ViewType& data) {
    return data.extent(0);
  }
};

#endif // MAGISTRATE_KOKKOS_ENABLED

} // namespace vt::collective::reduce::allreduce

#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_DATA_HANDLER_H*/
203 changes: 203 additions & 0 deletions src/vt/collective/reduce/allreduce/helpers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
//@HEADER
// *****************************************************************************
//
// helpers.h
// DARMA/vt => Virtual Transport
//
// Copyright 2019-2021 National Technology & Engineering Solutions of Sandia, LLC
// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S.
// Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact darma@sandia.gov
//
// *****************************************************************************
//@HEADER
*/

#if !defined INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_HELPERS_H
#define INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_HELPERS_H

#include "data_handler.h"
#include "rabenseifner_msg.h"
#include "vt/messaging/message/shared_message.h"

#include <vector>
#include <type_traits>

namespace vt {
/// Yields \c T with any reference removed and then top-level const/volatile
/// qualifiers stripped.
template <typename T>
using remove_cvref =
  typename std::remove_cv<typename std::remove_reference<T>::type>::type;
}

namespace vt::collective::reduce::allreduce {

template <typename Scalar, typename DataT>
struct DataHelper {
using DataType = DataHandler<DataT>;

template <typename... Args>
static void assign(std::vector<Scalar>& dest, Args&&... data) {
dest = DataHandler<DataT>::toVec(std::forward<Args>(data)...);
}

static MsgPtr<RabenseifnerMsg<Scalar, DataT>> createMessage(
const std::vector<Scalar>& payload, size_t begin, size_t count, size_t id,
int32_t step = 0) {
return vt::makeMessage<RabenseifnerMsg<Scalar, DataT>>(
payload.data() + begin, count, id, step);
}

static void copy(
std::vector<Scalar>& dest, size_t start_idx, RabenseifnerMsg<Scalar, DataT>* msg) {
for (uint32_t i = 0; i < msg->size_; i++) {
dest[start_idx + i] = msg->val_[i];
}
}

template <template <typename Arg> class Op>
static void reduce(
std::vector<Scalar>& dest, size_t start_idx, RabenseifnerMsg<Scalar, DataT>* msg) {
for (uint32_t i = 0; i < msg->size_; i++) {
Op<Scalar>()(dest[start_idx + i], msg->val_[i]);
}
}

static void invoke() { }

static bool empty(const std::vector<Scalar>& payload) {
return payload.empty();
}
};

#if MAGISTRATE_KOKKOS_ENABLED

/**
 * \brief Buffer operations for the case where the working buffer is a rank-1
 * host-space \c Kokkos::View.
 *
 * All members are static; the struct carries no state.
 *
 * \tparam Scalar element type of the view
 */
template <typename Scalar>
struct DataHelper<Scalar, Kokkos::View<Scalar*, Kokkos::HostSpace>> {
  using DataT = Kokkos::View<Scalar*, Kokkos::HostSpace>;
  using DataType = DataHandler<DataT>;

  /// Assign to \c dest a view list-initialized from the forwarded arguments.
  template <typename... Args>
  static void assign(DataT& dest, Args&&... data) {
    dest = {std::forward<Args>(data)...};
  }

  /**
   * \brief Create a message from the subview [begin, begin + count) of
   * \c payload.
   *
   * \param[in] payload source view
   * \param[in] begin   index of the first element of the subview
   * \param[in] count   number of elements in the subview
   * \param[in] id      identifier passed through to the message
   * \param[in] step    step value passed through to the message (default 0)
   */
  static MsgPtr<RabenseifnerMsg<Scalar, DataT>> createMessage(
    const DataT& payload, size_t begin, size_t count, size_t id,
    int32_t step = 0) {
    using MsgT = RabenseifnerMsg<Scalar, DataT>;
    const auto range = std::make_pair(begin, begin + count);
    return vt::makeMessage<MsgT>(Kokkos::subview(payload, range), id, step);
  }

  /**
   * \brief Store every element of the message's view into \c dest, beginning
   * at \c start_idx. No bounds checking is performed here.
   */
  static void
  copy(DataT& dest, size_t start_idx, RabenseifnerMsg<Scalar, DataT>* msg) {
    const auto num_elems = msg->val_.extent(0);
    Kokkos::parallel_for(
      "Rabenseifner::copy", num_elems,
      KOKKOS_LAMBDA(const int idx) { dest(start_idx + idx) = msg->val_(idx); }
    );
  }

  /**
   * \brief Apply \c Op<Scalar> element-wise, combining each element of the
   * message's view into the matching slot of \c dest starting at \c start_idx.
   *
   * The destination element is passed as the first operand and the message
   * element as the second.
   */
  template <template <typename Arg> class Op>
  static void reduce(
    DataT& dest, size_t start_idx, RabenseifnerMsg<Scalar, DataT>* msg) {
    const auto num_elems = msg->val_.extent(0);
    Kokkos::parallel_for(
      "Rabenseifner::reduce", num_elems, KOKKOS_LAMBDA(const int idx) {
        Op<Scalar>()(dest(start_idx + idx), msg->val_(idx));
      }
    );
  }

  /// Intentionally does nothing.
  static void invoke() { }

  /// \return whether \c payload has zero elements along its only dimension
  static bool empty(const DataT& payload) {
    return payload.extent(0) == 0;
  }
};

#endif // MAGISTRATE_KOKKOS_ENABLED

/**
 * \brief Bookkeeping fields shared by the \c State specializations.
 *
 * Contains data members only. Every member has a default initializer, so a
 * default-constructed object starts with masks at 1, counters and steps at 0,
 * flags false and all vectors empty.
 *
 * NOTE(review): the roles suggested by the field names (adjustment / scatter /
 * gather phases) are not visible in this header -- confirm against the code
 * that drives this state.
 */
struct StateBase {
  size_t size_{};

  bool finished_adjustment_part_{false};

  int32_t mask_{1};
  int32_t step_{0};
  bool initialized_{false};
  bool completed_{false};

  // Scatter-phase fields
  int32_t scatter_mask_{1};
  int32_t scatter_step_{0};
  int32_t scatter_num_recv_{0};
  std::vector<bool> scatter_steps_recv_{};
  std::vector<bool> scatter_steps_reduced_{};

  bool finished_scatter_part_{false};

  // Gather-phase fields
  int32_t gather_step_{0};
  int32_t gather_mask_{1};
  int32_t gather_num_recv_{0};
  std::vector<bool> gather_steps_recv_{};
  std::vector<bool> gather_steps_reduced_{};

  // Index/count tables (presumably receive/send offsets per step -- TODO confirm)
  std::vector<uint32_t> r_index_{};
  std::vector<uint32_t> r_count_{};
  std::vector<uint32_t> s_index_{};
  std::vector<uint32_t> s_count_{};
};

/**
 * \brief State whose value buffer is a \c std::vector<Scalar>.
 *
 * Adds the value buffer and the stored message handles to the fields of
 * \c StateBase. All members start empty / null.
 *
 * \tparam Scalar element type of the value buffer
 * \tparam DataT  payload type used to parameterize the message type
 */
template <typename Scalar, typename DataT>
struct State : StateBase {
  std::vector<Scalar> val_{};

  MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>> left_adjust_message_ = nullptr;
  MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>> right_adjust_message_ = nullptr;
  std::vector<MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>>> scatter_messages_{};
  std::vector<MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>>> gather_messages_{};
};

#if MAGISTRATE_KOKKOS_ENABLED
/**
 * \brief State whose value buffer is a rank-1 host-space \c Kokkos::View.
 *
 * Adds the value buffer and the stored message handles to the fields of
 * \c StateBase. The view is default-constructed and the message members start
 * empty / null.
 *
 * \tparam Scalar element type of the view
 */
template <typename Scalar>
struct State<Scalar, Kokkos::View<Scalar*, Kokkos::HostSpace>> : StateBase {
  using DataT = Kokkos::View<Scalar*, Kokkos::HostSpace>;

  Kokkos::View<Scalar*, Kokkos::HostSpace> val_{};

  MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>> left_adjust_message_ = nullptr;
  MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>> right_adjust_message_ = nullptr;
  std::vector<MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>>> scatter_messages_{};
  std::vector<MsgSharedPtr<RabenseifnerMsg<Scalar, DataT>>> gather_messages_{};
};
#endif //MAGISTRATE_KOKKOS_ENABLED

} // namespace vt::collective::reduce::allreduce
#endif /*INCLUDED_VT_COLLECTIVE_REDUCE_ALLREDUCE_HELPERS_H*/
Loading