diff --git a/.jenkins/continuous.groovy b/.jenkins/continuous.groovy index 754184fc7..d716e2166 100644 --- a/.jenkins/continuous.groovy +++ b/.jenkins/continuous.groovy @@ -89,6 +89,8 @@ pipeline { -D CMAKE_CXX_COMPILER=$KOKKOS_DIR/bin/nvcc_wrapper \ -D CMAKE_CXX_EXTENSIONS=OFF \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR" \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' @@ -148,6 +150,8 @@ pipeline { -D CMAKE_CXX_COMPILER=$KOKKOS_DIR/bin/nvcc_wrapper \ -D CMAKE_CXX_EXTENSIONS=OFF \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR" \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' @@ -266,6 +270,8 @@ pipeline { -D CMAKE_CXX_COMPILER=clang++ \ -D CMAKE_CXX_EXTENSIONS=OFF \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR" \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' @@ -326,6 +332,8 @@ pipeline { -D CMAKE_CXX_COMPILER=g++ \ -D CMAKE_CXX_EXTENSIONS=OFF \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR" \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' @@ -389,6 +397,8 @@ pipeline { -D CMAKE_CXX_EXTENSIONS=OFF \ -D CMAKE_BUILD_TYPE=RelWithDebInfo \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR" \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' @@ -451,6 +461,8 @@ pipeline { -D CMAKE_CXX_FLAGS="-Wno-unknown-cuda-version" \ -D CMAKE_PREFIX_PATH="$KOKKOS_DIR;$ARBORX_DIR;$ONE_DPL_DIR" \ -D ONEDPL_PAR_BACKEND=serial \ + -D MPIEXEC_PREFLAGS="--allow-run-as-root" \ + -D MPIEXEC_MAX_NUMPROCS=4 \ examples \ ''' sh 'make VERBOSE=1' diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6dcdbb8d1..4e9c38ef4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,10 @@ add_subdirectory(dbscan) add_subdirectory(molecular_dynamics) add_subdirectory(simple_intersection) +if(ARBORX_ENABLE_MPI) + add_subdirectory(distributed_tree) +endif() + find_package(Boost COMPONENTS program_options) if(Boost_FOUND) add_subdirectory(viz) diff --git a/examples/distributed_tree/CMakeLists.txt b/examples/distributed_tree/CMakeLists.txt new file mode 100644 index 000000000..0f5eb99d8 --- /dev/null +++ b/examples/distributed_tree/CMakeLists.txt @@ -0,0 +1,3 @@ +add_executable(ArborX_DistributedTree_KNNCallback.exe distributed_knn_callback.cpp) +target_link_libraries(ArborX_DistributedTree_KNNCallback.exe ArborX::ArborX) +add_test(NAME ArborX_DistributedTree_KNNCallback_Example COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} ./ArborX_DistributedTree_KNNCallback.exe ${MPIEXEC_POSTFLAGS}) diff --git a/examples/distributed_tree/distributed_knn_callback.cpp b/examples/distributed_tree/distributed_knn_callback.cpp new file mode 100644 index 000000000..565ddaa87 --- /dev/null +++ b/examples/distributed_tree/distributed_knn_callback.cpp @@ -0,0 +1,139 @@ +/**************************************************************************** + * Copyright (c) 2017-2022 by the ArborX authors * + * All rights reserved. * + * * + * This file is part of the ArborX library. ArborX is * + * distributed under a BSD 3-clause license. For the licensing terms see * + * the LICENSE file in the top-level directory. * + * * + * SPDX-License-Identifier: BSD-3-Clause * + ****************************************************************************/ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +using ExecutionSpace = Kokkos::DefaultExecutionSpace; +using MemorySpace = ExecutionSpace::memory_space; + +namespace Example +{ +template +struct Nearest +{ + Points points; + int k; + int mpi_rank; +}; +template +Nearest(Points const &, int, int) -> Nearest; + +struct IndexAndRank +{ + int index; + int rank; +}; + +template +struct PrintAndInsert +{ + Kokkos::View points; + int mpi_rank; + + PrintAndInsert(Kokkos::View const &points_, + int mpi_rank_) + : points(points_) + , mpi_rank(mpi_rank_) + {} + + template + KOKKOS_FUNCTION void operator()([[maybe_unused]] Predicate const &predicate, + int primitive_index, + OutputFunctor const &out) const + { +#ifndef KOKKOS_ENABLE_SYCL + auto data = ArborX::getData(predicate); + auto const &point = points(primitive_index); + printf("Match for query %d from MPI rank %d on MPI rank %d for " + "point %f,%f,%f with index %d\n", + data.index, data.rank, mpi_rank, point[0], point[1], point[2], + primitive_index); +#endif + + out({primitive_index, mpi_rank}); + } +}; + +} // namespace Example + +template +struct ArborX::AccessTraits, ArborX::PredicatesTag> +{ + static KOKKOS_FUNCTION std::size_t size(Example::Nearest const &x) + { + return x.points.extent(0); + } + static KOKKOS_FUNCTION auto get(Example::Nearest const &x, int i) + { + return attach(ArborX::nearest(x.points(i), x.k), + Example::IndexAndRank{i, x.mpi_rank}); + } + using memory_space = MemorySpace; +}; + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + Kokkos::initialize(argc, argv); + { + MPI_Comm comm = MPI_COMM_WORLD; + int comm_rank; + MPI_Comm_rank(comm, &comm_rank); + int comm_size; + MPI_Comm_size(comm, &comm_size); + ArborX::Point lower_left_corner = {static_cast(comm_rank), + static_cast(comm_rank), + static_cast(comm_rank)}; + ArborX::Point center = {static_cast(comm_rank) + .5f, + static_cast(comm_rank) + .5f, + static_cast(comm_rank) + .5f}; + std::vector points = {lower_left_corner, center}; + auto points_device = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View(points.data(), points.size())); + + ExecutionSpace exec; + ArborX::DistributedTree tree(comm, exec, points_device); + + Kokkos::View values("values", 0); + Kokkos::View offsets("offsets", 0); + tree.query(exec, Example::Nearest{points_device, 3, comm_rank}, + Example::PrintAndInsert(points_device, comm_rank), + values, offsets); + + auto host_values = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, values); + auto host_offsets = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, offsets); + for (unsigned int i = 0; i + 1 < host_offsets.size(); ++i) + { + std::cout << "Results for query " << i << " on MPI rank " << comm_rank + << '\n'; + for (int j = host_offsets(i); j < host_offsets(i + 1); ++j) + std::cout << "point " << host_values(j).index << ", rank " + << host_values(j).rank << std::endl; + } + } + Kokkos::finalize(); + MPI_Finalize(); + return 0; +} diff --git a/src/details/ArborX_DetailsDistributedTreeImpl.hpp b/src/details/ArborX_DetailsDistributedTreeImpl.hpp index 06da643e6..b67b08cbf 100644 --- a/src/details/ArborX_DetailsDistributedTreeImpl.hpp +++ b/src/details/ArborX_DetailsDistributedTreeImpl.hpp @@ -160,6 +160,15 @@ struct DistributedTreeImpl Indices &indices, Offset &offset, Ranks &ranks, Distances *distances_ptr = nullptr); + template + static std::enable_if_t{} && + Kokkos::is_view{}> + queryDispatch(NearestPredicateTag, DistributedTree const &tree, + ExecutionSpace const &space, Predicates const &queries, + Callback const &callback, OutputView &out, OffsetView &offset); + template static std::enable_if_t{} && @@ -168,7 +177,6 @@ struct DistributedTreeImpl ExecutionSpace const &space, Predicates const &queries, IndicesAndRanks &values, Offset &offset) { - // FIXME avoid zipping when distributed nearest callbacks become available Kokkos::View indices( "ArborX::DistributedTree::query::nearest::indices", 0); Kokkos::View ranks( @@ -309,8 +317,8 @@ void DistributedTreeImpl::deviseStrategy( // Accumulate total leave count in the local trees until it reaches k which // is the number of neighbors queried for. Stop if local trees get - // empty because it means that they are no more leaves and there is no point - // on forwarding queries to leafless trees. + // empty because that means that there are no more leaves and there is no + // point in forwarding queries to leafless trees. using Access = AccessTraits; auto const n_queries = Access::size(queries); Kokkos::View new_offset( @@ -627,6 +635,149 @@ DistributedTreeImpl::queryDispatch( Kokkos::Profiling::popRegion(); } +template +struct QueriesWithIndices +{ + Query query; + int query_id; + int primitive_index; +}; + +template +template +std::enable_if_t{} && Kokkos::is_view{}> +DistributedTreeImpl::queryDispatch( + NearestPredicateTag, DistributedTree const &tree, + ExecutionSpace const &space, Predicates const &queries, + Callback const &callback, OutputView &out, OffsetView &offset) +{ + Kokkos::Profiling::pushRegion( + "ArborX::DistributedTree::query::nearest_callback"); + Kokkos::View indices( + "ArborX::DistributedTree::query::nearest::indices", 0); + Kokkos::View ranks( + "ArborX::DistributedTree::query::nearest::ranks", 0); + + // Distributed nearest callbacks strategy: + // - Find the ranks and indices for the nearest queries using a regular query + // without a callback. + // - Scatter (predicate, primitive) pairs to the corresponding matching ranks. + // - Execute the callback on the process owning the primitives. + // - Send the result back to the process owning the predicates. + + // Find the ranks and indices for the nearest queries using the overload not + // taking a callback. + queryDispatchImpl(NearestPredicateTag{}, tree, space, queries, indices, + offset, ranks); + Kokkos::Profiling::popRegion(); + + Kokkos::Profiling::pushRegion( + "ArborX::DistributedTree::query::nearest::execute_callback"); + + // Send the predicate-primitive pairs to the process where the match was + // found. + auto comm = tree.getComm(); + int comm_rank; + MPI_Comm_rank(comm, &comm_rank); + + using Access = AccessTraits; + using Query = typename AccessTraitsHelper::type; + + Kokkos::View *, typename DeviceType::memory_space> + exported_queries_with_indices( + Kokkos::view_alloc( + space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::exported_queries_with_indices"), + ranks.size()); + Kokkos::parallel_for( + "ArborX::DistributedTree::query::zip_queries_and_primitives", + Kokkos::RangePolicy(space, 0, Access::size(queries)), + KOKKOS_LAMBDA(int q) { + using index_type = typename OffsetView::value_type; + for (index_type i = offset(q); i < offset(q + 1); ++i) + exported_queries_with_indices(i) = {Access::get(queries, q), q, + indices(i)}; + }); + + Distributor distributor(comm); + auto const n_imports = distributor.createFromSends(space, ranks); + + Kokkos::View *, typename DeviceType::memory_space> + imported_queries_with_indices( + Kokkos::view_alloc( + space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::imported_queries_with_indices"), + n_imports); + + sendAcrossNetwork(space, distributor, exported_queries_with_indices, + imported_queries_with_indices); + + // Execute the callback on the process owning the primitives. + OutputView remote_out( + Kokkos::view_alloc(space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::remote_out"), + n_imports); + KokkosExt::reallocWithoutInitializing(space, indices, n_imports); + Kokkos::deep_copy(space, indices, -1); + Kokkos::parallel_for( + "ArborX::DistributedTree::query::execute_callbacks", + Kokkos::RangePolicy(space, 0, + imported_queries_with_indices.size()), + KOKKOS_LAMBDA(int i) { + callback(imported_queries_with_indices(i).query, + imported_queries_with_indices(i).primitive_index, + [&](typename OutputView::value_type const &value) { +#ifndef NDEBUG + // FIXME We only allow calling the callback once per match. + if (indices(i) != -1) + Kokkos::abort("Inserting more than one result per " + "callback is not implemented!"); +#endif + remote_out(i) = value; + indices(i) = imported_queries_with_indices(i).query_id; + }); + }); + + // Send the result back to the process owning the predicates. + Distributor back_distributor(comm); + auto const &dest = distributor.getSources(); + auto const &off = distributor.getSourceOffsets(); + + Kokkos::View host_destinations(dest.data(), + dest.size()); + Kokkos::View host_offsets(off.data(), + off.size()); + typename DeviceType::memory_space memory_space; + Kokkos::View destinations( + Kokkos::view_alloc(space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::destinations"), + dest.size()); + Kokkos::deep_copy(space, destinations, host_destinations); + Kokkos::View offsets( + Kokkos::view_alloc(space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::offsets"), + off.size()); + Kokkos::deep_copy(space, offsets, host_offsets); + auto const n_imports_back = + back_distributor.createFromSends(space, destinations, offsets); + KokkosExt::reallocWithoutInitializing(space, out, n_imports_back); + Kokkos::View query_ids( + Kokkos::view_alloc(space, Kokkos::WithoutInitializing, + "ArborX::DistributedTree::query::nearest::query_ids"), + n_imports_back); + + // FIXME does combining communication here help? + sendAcrossNetwork(space, back_distributor, remote_out, out); + sendAcrossNetwork(space, back_distributor, indices, query_ids); + + auto const permutation = ArborX::Details::sortObjects(space, query_ids); + ArborX::Details::applyPermutation(space, permutation, out); + + Kokkos::Profiling::popRegion(); +} + template template void DistributedTreeImpl::sortResults(ExecutionSpace const &space, diff --git a/src/details/ArborX_DetailsDistributor.hpp b/src/details/ArborX_DetailsDistributor.hpp index e254f355c..6b4e11b40 100644 --- a/src/details/ArborX_DetailsDistributor.hpp +++ b/src/details/ArborX_DetailsDistributor.hpp @@ -54,7 +54,7 @@ determineBufferLayout(ExecutionSpace const &space, InputView batched_ranks, // In case all the batches are empty, return an empty list of unique_ranks and // counts, but still have one element in offsets. This is conforming with // creating the total offsets from batched_ranks and batched_offsets ignoring - // empty batches and calling sortAndDetermeineBufferLayout. + // empty batches and calling sortAndDetermineBufferLayout. offsets.push_back(0); auto const n_batched_ranks = batched_ranks.size(); @@ -384,6 +384,12 @@ class Distributor size_t getTotalReceiveLength() const { return _src_offsets.back(); } size_t getTotalSendLength() const { return _dest_offsets.back(); } + auto const &getDestinations() const { return _destinations; } + auto const &getSources() const { return _sources; } + + auto const &getDestinationOffsets() const { return _dest_offsets; } + auto const &getSourceOffsets() const { return _src_offsets; } + private: size_t preparePointToPointCommunication() { diff --git a/test/tstDistributedTree.cpp b/test/tstDistributedTree.cpp index 1b759f066..3613663f6 100644 --- a/test/tstDistributedTree.cpp +++ b/test/tstDistributedTree.cpp @@ -31,6 +31,39 @@ namespace tt = boost::test_tools; using ArborX::PairIndexRank; using ArborX::Details::PairIndexRankAndDistance; +struct PairRankIndex +{ + int rank; + int index; + + friend bool operator==(PairRankIndex lhs, PairRankIndex rhs) + { + return lhs.index == rhs.index && lhs.rank == rhs.rank; + } + friend bool operator<(PairRankIndex lhs, PairRankIndex rhs) + { + return lhs.rank < rhs.rank || + (lhs.rank == rhs.rank && lhs.index < rhs.index); + } + friend std::ostream &operator<<(std::ostream &stream, + PairRankIndex const &pair) + { + return stream << '[' << pair.rank << ',' << pair.index << ']'; + } +}; + +struct InlineCallback +{ + int mpi_rank; + + template + KOKKOS_FUNCTION void operator()(Predicate const &, int primitive_index, + OutputFunctor const &out) const + { + out({mpi_rank, primitive_index}); + } +}; + BOOST_AUTO_TEST_CASE_TEMPLATE(hello_world, DeviceType, ARBORX_DEVICE_TYPES) { using Tree = ArborX::DistributedTree; @@ -128,6 +161,26 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(hello_world, DeviceType, ARBORX_DEVICE_TYPES) {{0, comm_size - 1 - comm_rank}, {1, comm_size - 1 - comm_rank}}, {0, 2})); } + + // Now do the same with callbacks + if (comm_rank < comm_size - 1) + { + ARBORX_TEST_QUERY_TREE_CALLBACK(ExecutionSpace{}, tree, nearest_queries, + InlineCallback{comm_rank}, + make_reference_solution( + {{comm_size - 1 - comm_rank, 0}, + {comm_size - 2 - comm_rank, n - 1}, + {comm_size - 1 - comm_rank, 1}}, + {0, 3})); + } + else + { + ARBORX_TEST_QUERY_TREE_CALLBACK( + ExecutionSpace{}, tree, nearest_queries, InlineCallback{comm_rank}, + make_reference_solution( + {{comm_size - 1 - comm_rank, 0}, {comm_size - 1 - comm_rank, 1}}, + {0, 2})); + } } BOOST_AUTO_TEST_CASE_TEMPLATE(empty_tree, DeviceType, ARBORX_DEVICE_TYPES)