From 65802a56c3ac2a4b473b9103e0bd629293d52b3e Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 4 Apr 2024 15:47:45 +0100
Subject: [PATCH] Make mpi (device-aware) examples backend agnostic (#11)

Make MPI samples backend agnostic, rather than being CUDA-only.

---------

Signed-off-by: JackAKirk
Co-authored-by: Rafal Bielski
---
 CMakeLists.txt | 2 +-
 README.md | 58 ++++++++++-----
 src/MPI_for_CUDA_backend/CMakeLists.txt | 15 ----
 src/MPI_with_SYCL/CMakeLists.txt | 11 +++
 .../scatter_reduce_gather.cpp | 11 ++-
 .../send_recv_buff.cpp | 72 +++++++++++++------
 .../send_recv_usm.cpp | 13 ++--
 7 files changed, 115 insertions(+), 67 deletions(-)
 delete mode 100644 src/MPI_for_CUDA_backend/CMakeLists.txt
 create mode 100644 src/MPI_with_SYCL/CMakeLists.txt
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/scatter_reduce_gather.cpp (94%)
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/send_recv_buff.cpp (59%)
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/send_recv_usm.cpp (84%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba48e67..1cd12ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ include(cmake/ConfigureSYCL.cmake)
# Configure the demo projects
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} CACHE PATH "" FORCE)
add_subdirectory(src/matrix_multiply_omp_compare)
-add_subdirectory(src/MPI_for_CUDA_backend)
+add_subdirectory(src/MPI_with_SYCL)
add_subdirectory(src/scan_parallel_inclusive)
if(ENABLE_GRAPHICS)
add_subdirectory(src/fluid)
diff --git a/README.md b/README.md
index 520cbbd..a0cfd06 100644
--- a/README.md
+++ b/README.md
@@ -30,12 +30,16 @@ direction of the mouse travel. The fluid fades slowly over
time so as not to fill the container.

## Non-graphical Demos
-### MPI for CUDA Backend
-MPI, the Message Passing Interface, is a standard API for communicating data via
-messages between distributed processes that is commonly used in HPC to build
-applications that can scale to multi-node computer clusters.
+### MPI with SYCL
+MPI, the Message Passing Interface, is a standard API for communicating data
+via messages between distributed processes that is commonly used in HPC to
+build applications that can scale to multi-node computer clusters.
The three minimal code examples demonstrate how some GPUs can support
-CUDA-Aware MPI together with SYCL.
+GPU-Aware MPI together with SYCL. This enables fast device-to-device memory
+transfers and collective operations without going via the host.
+More generally, the USM code samples are also portable across any SYCL backend
+(including CPU devices) supported by the MPI implementation. For this reason we
+use the more general term "device-aware" MPI.

The first example uses the SYCL Unified Shared Memory (USM) memory model
(`send_recv_usm`). The second uses the Buffer (`send_recv_buff`) model. Each
@@ -50,17 +54,39 @@ using the SYCL 2020 reduction interface. Finally, the partial results from each
rank are reduced to a final scalar value, `res`, using Reduce. Finally, the
initial data is updated using Gather.

-These three examples form part of the [Codeplay oneAPI for NVIDIA GPUs plugin
-documentation](https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide).
-The documentation refers to the gpu-aware MPI guide for the CUDA backend.
-
-Building the MPI-CUDA examples requires the CUDA backend to be enabled and the
-MPI headers and library to be present on the system. This demo will be
-automatically skipped when not building for the CUDA backend or when MPI is not
-installed/detected. A message saying this will appear in the CMake configuration
-output. Additionally, in order to run the examples, the MPI implementation needs
-to be CUDA-aware. This is only detectable at runtime, so the examples may build
-fine but crash on execution if the linked MPI library isn't CUDA-aware.
+These three examples form part of the Codeplay oneAPI for [NVIDIA GPUs](https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide)
+and [AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide)
+plugin documentation.
+These two links point to the device-aware MPI guides for the CUDA and HIP backends,
+respectively.
+
+Building the MPI examples requires that the correct
+MPI headers and library be present on the system, and that `CMAKE_CXX_COMPILER`
+is set correctly (e.g. to an MPI wrapper such as `mpicxx`, if you use one).
+This demo will be automatically skipped when MPI is not installed/detected.
+Sometimes CMake fails to find the correct MPI library; a message saying so
+will appear in the CMake configuration output. If this occurs, you
+should adjust the CMakeLists.txt manually to point at the location of your
+MPI installation, e.g.:
+
+```diff
+--- a/src/MPI_with_SYCL/CMakeLists.txt
++++ b/src/MPI_with_SYCL/CMakeLists.txt
+@@ -5,7 +5,7 @@ else()
+ message(STATUS "Found MPI, configuring the MPI_with_SYCL demo")
+ foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
+ add_executable(${TARGET} ${TARGET}.cpp)
+- target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
+- target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
++ target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS} -I/opt/cray/pe/mpich/8.1.25/ofi/cray/10.0/include/)
++ target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES} -L/opt/cray/pe/mpich/8.1.25/ofi/cray/10.0/lib)
+ endforeach()
+ endif()
+```
+
+Additionally, in order to run the examples, the MPI implementation needs
+to be device-aware. This is only detectable at runtime, so the examples may build
+fine but crash on execution if the linked MPI library isn't device-aware.
### Parallel Inclusive Scan
Implementation of a parallel inclusive scan with a given associative binary
diff --git a/src/MPI_for_CUDA_backend/CMakeLists.txt b/src/MPI_for_CUDA_backend/CMakeLists.txt
deleted file mode 100644
index 90390da..0000000
--- a/src/MPI_for_CUDA_backend/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-if(NOT ENABLE_CUDA)
- message(STATUS "CUDA backend is disabled, skipping the MPI_for_CUDA_backend demo")
-else()
- find_package(MPI)
- if(NOT MPI_FOUND)
- message(STATUS "MPI not found, skipping the MPI_for_CUDA_backend demo")
- else()
- message(STATUS "Found MPI, configuring the MPI_for_CUDA_backend demo")
- foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
- add_executable(${TARGET} ${TARGET}.cpp)
- target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
- target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
- endforeach()
- endif()
-endif()
diff --git a/src/MPI_with_SYCL/CMakeLists.txt b/src/MPI_with_SYCL/CMakeLists.txt
new file mode 100644
index 0000000..101282b
--- /dev/null
+++ b/src/MPI_with_SYCL/CMakeLists.txt
@@ -0,0 +1,11 @@
+find_package(MPI)
+if(NOT MPI_FOUND)
+ message(STATUS "MPI not found, skipping the MPI_with_SYCL demo")
+else()
+ message(STATUS "Found MPI, configuring the MPI_with_SYCL demo")
+ foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
+ add_executable(${TARGET} ${TARGET}.cpp)
+ target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
+ target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
+ endforeach()
+endif()
diff --git a/src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
similarity index 94%
rename from src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp
rename to src/MPI_with_SYCL/scatter_reduce_gather.cpp
index 8628393..003de36 100644
--- a/src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp
+++ b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
@@ -1,8 +1,7 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx scatter_reduce_gather.cpp -o res`
-// Where sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

// This sample runs a common HPC programming idiom in a simplified form. Firstly
// a data array is scattered to two processes associated with
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
}

/* -------------------------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
--------------------------------------------------------------------------------------------*/
sycl::queue q{};
diff --git a/src/MPI_for_CUDA_backend/send_recv_buff.cpp b/src/MPI_with_SYCL/send_recv_buff.cpp
similarity index 59%
rename from src/MPI_for_CUDA_backend/send_recv_buff.cpp
rename to src/MPI_with_SYCL/send_recv_buff.cpp
index d466ebe..fc4ccd0 100644
--- a/src/MPI_for_CUDA_backend/send_recv_buff.cpp
+++ b/src/MPI_with_SYCL/send_recv_buff.cpp
@@ -1,17 +1,51 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx send_recv_buff.cpp -o res`
-// where sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

-// This example shows how to use CUDA-aware MPI with SYCL Buffer memory using a
-// simple send-receive pattern.
+// This example shows how to use device-aware MPI with SYCL Buffer memory using
+// a simple send-receive pattern.

#include <assert.h>
#include <mpi.h>
#include <sycl/sycl.hpp>

+/// Get the native device pointer from a SYCL accessor
+template <typename Accessor>
+inline void *getDevicePointer(const Accessor &acc,
+ const sycl::interop_handle &ih) {
+ void *device_ptr{nullptr};
+ switch (ih.get_backend()) {
+#if SYCL_EXT_ONEAPI_BACKEND_CUDA
+ case sycl::backend::ext_oneapi_cuda: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
+ break;
+ }
+#endif
+#if SYCL_EXT_ONEAPI_BACKEND_HIP
+ case sycl::backend::ext_oneapi_hip: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_hip>(acc));
+ break;
+ }
+#endif
+ case sycl::backend::ext_oneapi_level_zero: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_level_zero>(acc));
+ break;
+ }
+ default: {
+ throw std::runtime_error{
+ "Backend does not yet support buffer interop "
+ "required for device-aware MPI with sycl::buffer"};
+ break;
+ }
+ }
+ return device_ptr;
+}
+
 int main(int argc, char *argv[]) {
 /* ---------------------------------------------------------------------------
 MPI Initialization.
@@ -37,7 +71,7 @@ int main(int argc, char *argv[]) {
 }

 /* ---------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
 ----------------------------------------------------------------------------*/
 sycl::queue q{};

@@ -66,19 +100,15 @@ int main(int argc, char *argv[]) {
 auto kern = [=](sycl::id<1> id) { acc[id] *= 2; };
 h.parallel_for(sycl::range<1>{nelem}, kern);
 };
- // When using buffers with CUDA-aware MPI, a host_task must be used with a
- // sycl::interop_handle in the following way. This host task command group
- // uses MPI_Send to send the data to rank 1.
+ // When using buffers with device-aware MPI, a host_task must be used with
+ // a sycl::interop_handle in the following way. This host task command
+ // group uses MPI_Send to send the data to rank 1.
 auto ht = [&](sycl::handler &h) {
 sycl::accessor acc{buff, h};
 h.host_task([=](sycl::interop_handle ih) {
- // get the native CUDA device pointer from the SYCL accessor.
- auto cuda_ptr = reinterpret_cast<void *>(
- ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
-
- MPI_Status status;
+ void *device_ptr = getDevicePointer(acc, ih);
 // Send the data from rank 0 to rank 1.
- MPI_Send(cuda_ptr, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
+ MPI_Send(device_ptr, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
 printf("Sent %d elements from %d to 1\n", nelem, rank);
 });
 };
@@ -92,13 +122,11 @@ int main(int argc, char *argv[]) {
 auto ht = [&](sycl::handler &h) {
 sycl::accessor acc{buff, h};
 h.host_task([=](sycl::interop_handle ih) {
- // get the native CUDA device pointer from the SYCL accessor.
- auto cuda_ptr = reinterpret_cast<void *>(
- ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
-
+ void *device_ptr = getDevicePointer(acc, ih);
 MPI_Status status;
 // Receive the data sent from rank 0.
- MPI_Recv(cuda_ptr, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD, &status);
+ MPI_Recv(device_ptr, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD,
+ &status);
 printf("received status==%d\n", status.MPI_ERROR);
 });
 };
diff --git a/src/MPI_for_CUDA_backend/send_recv_usm.cpp b/src/MPI_with_SYCL/send_recv_usm.cpp
similarity index 84%
rename from src/MPI_for_CUDA_backend/send_recv_usm.cpp
rename to src/MPI_with_SYCL/send_recv_usm.cpp
index 26c0d2e..1487243 100644
--- a/src/MPI_for_CUDA_backend/send_recv_usm.cpp
+++ b/src/MPI_with_SYCL/send_recv_usm.cpp
@@ -1,10 +1,9 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx send_recv_usm.cpp -o res` Where
-// sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

-// This example shows how to use CUDA-aware MPI with SYCL USM memory using a
+// This example shows how to use device-aware MPI with SYCL USM memory using a
 // simple send-receive pattern.

#include <assert.h>
@@ -37,7 +36,7 @@ int main(int argc, char *argv[]) {
 }

 /* -------------------------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
--------------------------------------------------------------------------------------------*/
 sycl::queue q{};
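
The patch only touches the header comments of `send_recv_usm.cpp`, so the USM pattern that the updated README describes is not visible in the diff itself. The following is a minimal sketch of that pattern, not the repository file: the two-rank check, `nelem`, and the doubling kernel are illustrative assumptions. The key point is that the raw `sycl::malloc_device` pointer is handed directly to `MPI_Send`/`MPI_Recv`, which requires a device-aware MPI when the allocation lives in GPU memory.

```cpp
// Minimal sketch (not the repository's send_recv_usm.cpp): device-aware MPI
// send/receive with SYCL device USM. Assumes exactly two ranks.
#include <mpi.h>
#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);

  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (size != 2) {
    if (rank == 0) printf("This sketch expects exactly 2 ranks.\n");
    MPI_Finalize();
    return 1;
  }

  sycl::queue q{};  // Default device for this rank.

  constexpr int nelem = 1024;                 // number of ints exchanged
  constexpr int nsize = nelem * sizeof(int);  // message size in bytes

  // Device USM allocation: a device-aware MPI can read/write this pointer
  // directly, without staging through a host copy.
  int *devp = sycl::malloc_device<int>(nelem, q);

  const int tag = 0;
  if (rank == 0) {
    std::vector<int> host(nelem, 1);
    q.memcpy(devp, host.data(), nsize).wait();
    // Modify the data on the device before sending.
    q.parallel_for(sycl::range<1>(nelem),
                   [=](sycl::id<1> id) { devp[id] *= 2; })
        .wait();
    MPI_Send(devp, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
  } else {
    MPI_Status status;
    MPI_Recv(devp, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD, &status);
    std::vector<int> host(nelem);
    q.memcpy(host.data(), devp, nsize).wait();
    printf("Rank 1 received %d (expected 2)\n", host[0]);
  }

  sycl::free(devp, q);
  MPI_Finalize();
  return 0;
}
```

Run with two ranks (e.g. `mpirun -n 2`); as the README notes, a non-device-aware MPI will build this fine but may crash at the first MPI call on a device pointer.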
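The README's description of `scatter_reduce_gather` (Scatter the data, reduce each rank's chunk locally with the SYCL 2020 reduction interface, then combine the partial results with Reduce) can likewise be sketched. This is not the repository's implementation: it assumes the element count divides evenly across ranks, uses illustrative names such as `chunk` and `partial`, and omits the final Gather step.

```cpp
// Minimal sketch (not the repository's scatter_reduce_gather.cpp) of the
// Scatter -> local SYCL reduction -> MPI_Reduce idiom with device USM.
#include <mpi.h>
#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);

  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  sycl::queue q{};

  constexpr int N = 1024;        // total elements; assumed divisible by size
  const int local_n = N / size;  // elements handled per rank

  // The root rank initialises the full array in device USM; every rank owns a chunk.
  int *full = nullptr;
  if (rank == 0) {
    full = sycl::malloc_device<int>(N, q);
    std::vector<int> init(N, 1);
    q.memcpy(full, init.data(), N * sizeof(int)).wait();
  }
  int *chunk = sycl::malloc_device<int>(local_n, q);

  // Device-aware MPI lets the collective operate directly on device pointers.
  MPI_Scatter(full, local_n, MPI_INT, chunk, local_n, MPI_INT, 0,
              MPI_COMM_WORLD);

  // Local partial sum on the device via the SYCL 2020 reduction interface.
  int *partial = sycl::malloc_shared<int>(1, q);
  *partial = 0;
  q.submit([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>(local_n),
                    sycl::reduction(partial, sycl::plus<int>()),
                    [=](sycl::id<1> idx, auto &sum) { sum += chunk[idx]; });
   }).wait();

  // Combine the per-rank partial sums into a single scalar on the root rank.
  int res = 0;
  MPI_Reduce(partial, &res, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
  if (rank == 0) printf("res = %d (expected %d)\n", res, N);

  sycl::free(chunk, q);
  sycl::free(partial, q);
  if (rank == 0) sycl::free(full, q);
  MPI_Finalize();
  return 0;
}
```

Here `MPI_Reduce` reads the partial result from shared USM; with a device-aware MPI it could equally be a device USM pointer, which is the situation the README's runtime warning is about.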