From 65802a56c3ac2a4b473b9103e0bd629293d52b3e Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 4 Apr 2024 15:47:45 +0100
Subject: [PATCH] Make mpi (device-aware) examples backend agnostic (#11)

Make MPI samples backend agnostic, rather than being CUDA-only.

---------

Signed-off-by: JackAKirk
Co-authored-by: Rafal Bielski
---
 CMakeLists.txt | 2 +-
 README.md | 58 ++++++++++-----
 src/MPI_for_CUDA_backend/CMakeLists.txt | 15 ----
 src/MPI_with_SYCL/CMakeLists.txt | 11 +++
 .../scatter_reduce_gather.cpp | 11 ++-
 .../send_recv_buff.cpp | 72 +++++++++++++------
 .../send_recv_usm.cpp | 13 ++--
 7 files changed, 115 insertions(+), 67 deletions(-)
 delete mode 100644 src/MPI_for_CUDA_backend/CMakeLists.txt
 create mode 100644 src/MPI_with_SYCL/CMakeLists.txt
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/scatter_reduce_gather.cpp (94%)
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/send_recv_buff.cpp (59%)
 rename src/{MPI_for_CUDA_backend => MPI_with_SYCL}/send_recv_usm.cpp (84%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba48e67..1cd12ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ include(cmake/ConfigureSYCL.cmake)
# Configure the demo projects
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} CACHE PATH "" FORCE)
add_subdirectory(src/matrix_multiply_omp_compare)
-add_subdirectory(src/MPI_for_CUDA_backend)
+add_subdirectory(src/MPI_with_SYCL)
add_subdirectory(src/scan_parallel_inclusive)
if(ENABLE_GRAPHICS)
add_subdirectory(src/fluid)
diff --git a/README.md b/README.md
index 520cbbd..a0cfd06 100644
--- a/README.md
+++ b/README.md
@@ -30,12 +30,16 @@ direction of the mouse travel. The fluid fades slowly over
time so as not to fill the container.

## Non-graphical Demos
-### MPI for CUDA Backend
-MPI, the Message Passing Interface, is a standard API for communicating data via
-messages between distributed processes that is commonly used in HPC to build
-applications that can scale to multi-node computer clusters.
+### MPI with SYCL
+MPI, the Message Passing Interface, is a standard API for communicating data
+via messages between distributed processes that is commonly used in HPC to
+build applications that can scale to multi-node computer clusters.
The three minimal code examples demonstrate how some GPUs can support
-CUDA-Aware MPI together with SYCL.
+GPU-Aware MPI together with SYCL. This enables fast device-to-device memory
+transfers and collective operations without going via the host.
+More generally, the USM code samples are also portable across any SYCL backend
+(including CPU devices) supported by the MPI implementation. For this reason we
+use the more general term "device-aware" MPI.

The first example uses the SYCL Unified Shared Memory (USM) memory model
(`send_recv_usm`). The second uses the Buffer (`send_recv_buff`) model. Each
@@ -50,17 +54,39 @@ using the SYCL 2020 reduction interface. Finally, the partial results from each
rank are reduced to a final scalar value, `res`, using Reduce. Finally, the
initial data is updated using Gather.

-These three examples form part of the [Codeplay oneAPI for NVIDIA GPUs plugin
-documentation](https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide).
-The documentation refers to the gpu-aware MPI guide for the CUDA backend.
-
-Building the MPI-CUDA examples requires the CUDA backend to be enabled and the
-MPI headers and library to be present on the system. This demo will be
-automatically skipped when not building for the CUDA backend or when MPI is not
-installed/detected. A message saying this will appear in the CMake configuration
-output. Additionally, in order to run the examples, the MPI implementation needs
-to be CUDA-aware. This is only detectable at runtime, so the examples may build
-fine but crash on execution if the linked MPI library isn't CUDA-aware.
+These three examples form part of the Codeplay oneAPI for [NVIDIA GPUs](https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide)
+and [AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide)
+plugin documentation.
+These two links point to the device-aware MPI guides for the CUDA and HIP backends,
+respectively.
+
+Building the MPI examples requires that the correct
+MPI headers and library be present on the system, and that `CMAKE_CXX_COMPILER`
+is set correctly (e.g. to an MPI wrapper such as `mpicxx`, if you use one).
+This demo will be automatically skipped when MPI is not installed/detected.
+Sometimes CMake fails to find the correct MPI library; a message saying so
+will appear in the CMake configuration output. If this occurs, you
+should adjust the CMakeLists.txt manually to point at the location of your
+MPI installation, e.g.:
+
+```diff
+--- a/src/MPI_with_SYCL/CMakeLists.txt
++++ b/src/MPI_with_SYCL/CMakeLists.txt
+@@ -5,7 +5,7 @@ else()
+ message(STATUS "Found MPI, configuring the MPI_with_SYCL demo")
+ foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
+ add_executable(${TARGET} ${TARGET}.cpp)
+- target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
+- target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
++ target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS} -I/opt/cray/pe/mpich/8.1.25/ofi/cray/10.0/include/)
++ target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES} -L/opt/cray/pe/mpich/8.1.25/ofi/cray/10.0/lib)
+ endforeach()
+ endif()
+```
+
+Additionally, in order to run the examples, the MPI implementation needs
+to be device-aware. This is only detectable at runtime, so the examples may build
+fine but crash on execution if the linked MPI library isn't device-aware.
### Parallel Inclusive Scan
Implementation of a parallel inclusive scan with a given associative binary
diff --git a/src/MPI_for_CUDA_backend/CMakeLists.txt b/src/MPI_for_CUDA_backend/CMakeLists.txt
deleted file mode 100644
index 90390da..0000000
--- a/src/MPI_for_CUDA_backend/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-if(NOT ENABLE_CUDA)
- message(STATUS "CUDA backend is disabled, skipping the MPI_for_CUDA_backend demo")
-else()
- find_package(MPI)
- if(NOT MPI_FOUND)
- message(STATUS "MPI not found, skipping the MPI_for_CUDA_backend demo")
- else()
- message(STATUS "Found MPI, configuring the MPI_for_CUDA_backend demo")
- foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
- add_executable(${TARGET} ${TARGET}.cpp)
- target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
- target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
- endforeach()
- endif()
-endif()
diff --git a/src/MPI_with_SYCL/CMakeLists.txt b/src/MPI_with_SYCL/CMakeLists.txt
new file mode 100644
index 0000000..101282b
--- /dev/null
+++ b/src/MPI_with_SYCL/CMakeLists.txt
@@ -0,0 +1,11 @@
+find_package(MPI)
+if(NOT MPI_FOUND)
+ message(STATUS "MPI not found, skipping the MPI_with_SYCL demo")
+else()
+ message(STATUS "Found MPI, configuring the MPI_with_SYCL demo")
+ foreach(TARGET send_recv_usm send_recv_buff scatter_reduce_gather)
+ add_executable(${TARGET} ${TARGET}.cpp)
+ target_compile_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_INCLUDE_DIRS})
+ target_link_options(${TARGET} PUBLIC ${SYCL_FLAGS} ${MPI_LIBRARIES})
+ endforeach()
+endif()
diff --git a/src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
similarity index 94%
rename from src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp
rename to src/MPI_with_SYCL/scatter_reduce_gather.cpp
index 8628393..003de36 100644
--- a/src/MPI_for_CUDA_backend/scatter_reduce_gather.cpp
+++ b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
@@ -1,8 +1,7 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx scatter_reduce_gather.cpp -o res`
-// Where sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

// This sample runs a common HPC programming idiom in a simplified form. Firstly
// a data array is scattered to two processes associated with
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
}

/* -------------------------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
--------------------------------------------------------------------------------------------*/
sycl::queue q{};
diff --git a/src/MPI_for_CUDA_backend/send_recv_buff.cpp b/src/MPI_with_SYCL/send_recv_buff.cpp
similarity index 59%
rename from src/MPI_for_CUDA_backend/send_recv_buff.cpp
rename to src/MPI_with_SYCL/send_recv_buff.cpp
index d466ebe..fc4ccd0 100644
--- a/src/MPI_for_CUDA_backend/send_recv_buff.cpp
+++ b/src/MPI_with_SYCL/send_recv_buff.cpp
@@ -1,17 +1,51 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx send_recv_buff.cpp -o res`
-// where sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

-// This example shows how to use CUDA-aware MPI with SYCL Buffer memory using a
-// simple send-receive pattern.
+// This example shows how to use device-aware MPI with SYCL Buffer memory using
+// a simple send-receive pattern.

#include <assert.h>
#include <mpi.h>
#include <sycl/sycl.hpp>

+/// Get the native device pointer from a SYCL accessor
+template <typename Accessor>
+inline void *getDevicePointer(const Accessor &acc,
+ const sycl::interop_handle &ih) {
+ void *device_ptr{nullptr};
+ switch (ih.get_backend()) {
+#if SYCL_EXT_ONEAPI_BACKEND_CUDA
+ case sycl::backend::ext_oneapi_cuda: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
+ break;
+ }
+#endif
+#if SYCL_EXT_ONEAPI_BACKEND_HIP
+ case sycl::backend::ext_oneapi_hip: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_hip>(acc));
+ break;
+ }
+#endif
+ case sycl::backend::ext_oneapi_level_zero: {
+ device_ptr = reinterpret_cast<void *>(
+ ih.get_native_mem<sycl::backend::ext_oneapi_level_zero>(acc));
+ break;
+ }
+ default: {
+ throw std::runtime_error{
+ "Backend does not yet support buffer interop "
+ "required for device-aware MPI with sycl::buffer"};
+ break;
+ }
+ }
+ return device_ptr;
+}
+
 int main(int argc, char *argv[]) {
 /* ---------------------------------------------------------------------------
 MPI Initialization.
@@ -37,7 +71,7 @@ int main(int argc, char *argv[]) {
 }

 /* ---------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
 ----------------------------------------------------------------------------*/
 sycl::queue q{};

@@ -66,19 +100,15 @@ int main(int argc, char *argv[]) {
 auto kern = [=](sycl::id<1> id) { acc[id] *= 2; };
 h.parallel_for(sycl::range<1>{nelem}, kern);
 };
- // When using buffers with CUDA-aware MPI, a host_task must be used with a
- // sycl::interop_handle in the following way. This host task command group
- // uses MPI_Send to send the data to rank 1.
+ // When using buffers with device-aware MPI, a host_task must be used with
+ // a sycl::interop_handle in the following way. This host task command
+ // group uses MPI_Send to send the data to rank 1.
 auto ht = [&](sycl::handler &h) {
 sycl::accessor acc{buff, h};
 h.host_task([=](sycl::interop_handle ih) {
- // get the native CUDA device pointer from the SYCL accessor.
- auto cuda_ptr = reinterpret_cast<void *>(
- ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
-
- MPI_Status status;
+ void *device_ptr = getDevicePointer(acc, ih);
 // Send the data from rank 0 to rank 1.
- MPI_Send(cuda_ptr, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
+ MPI_Send(device_ptr, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
 printf("Sent %d elements from %d to 1\n", nelem, rank);
 });
 };
@@ -92,13 +122,11 @@ int main(int argc, char *argv[]) {
 auto ht = [&](sycl::handler &h) {
 sycl::accessor acc{buff, h};
 h.host_task([=](sycl::interop_handle ih) {
- // get the native CUDA device pointer from the SYCL accessor.
- auto cuda_ptr = reinterpret_cast<void *>(
- ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
-
+ void *device_ptr = getDevicePointer(acc, ih);
 MPI_Status status;
 // Receive the data sent from rank 0.
- MPI_Recv(cuda_ptr, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD, &status);
+ MPI_Recv(device_ptr, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD,
+ &status);
 printf("received status==%d\n", status.MPI_ERROR);
 });
 };
diff --git a/src/MPI_for_CUDA_backend/send_recv_usm.cpp b/src/MPI_with_SYCL/send_recv_usm.cpp
similarity index 84%
rename from src/MPI_for_CUDA_backend/send_recv_usm.cpp
rename to src/MPI_with_SYCL/send_recv_usm.cpp
index 26c0d2e..1487243 100644
--- a/src/MPI_for_CUDA_backend/send_recv_usm.cpp
+++ b/src/MPI_with_SYCL/send_recv_usm.cpp
@@ -1,10 +1,9 @@
-// Compile with `mpicxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-// -Xsycl-target-backend --cuda-gpu-arch=sm_xx send_recv_usm.cpp -o res` Where
-// sm_xx is the Compute Capability (CC). If the `-Xsycl-target-backend
-// --cuda-gpu-arch=` flags are not explicitly provided the lowest supported CC
-// will be used: sm_50.
+// Refer to
+// https://developer.codeplay.com/products/oneapi/nvidia/latest/guides/MPI-guide
+// or https://developer.codeplay.com/products/oneapi/amd/latest/guides/MPI-guide
+// for build/run instructions

-// This example shows how to use CUDA-aware MPI with SYCL USM memory using a
+// This example shows how to use device-aware MPI with SYCL USM memory using a
 // simple send-receive pattern.

#include <assert.h>
@@ -37,7 +36,7 @@ int main(int argc, char *argv[]) {
 }

 /* -------------------------------------------------------------------------------------------
- SYCL Initialization, which internally sets the CUDA device.
+ SYCL Initialization, which internally sets the device.
--------------------------------------------------------------------------------------------*/
 sycl::queue q{};
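
The patch only touches the header comments of `send_recv_usm.cpp`, so the USM pattern that the updated README describes is not visible in the diff itself. The following is a minimal sketch of that pattern, not the repository file: the two-rank check, `nelem`, and the doubling kernel are illustrative assumptions. The key point is that the raw `sycl::malloc_device` pointer is handed directly to `MPI_Send`/`MPI_Recv`, which requires a device-aware MPI when the allocation lives in GPU memory.

```cpp
// Minimal sketch (not the repository's send_recv_usm.cpp): device-aware MPI
// send/receive with SYCL device USM. Assumes exactly two ranks.
#include <mpi.h>
#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);

  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (size != 2) {
    if (rank == 0) printf("This sketch expects exactly 2 ranks.\n");
    MPI_Finalize();
    return 1;
  }

  sycl::queue q{};  // Default device for this rank.

  constexpr int nelem = 1024;                 // number of ints exchanged
  constexpr int nsize = nelem * sizeof(int);  // message size in bytes

  // Device USM allocation: a device-aware MPI can read/write this pointer
  // directly, without staging through a host copy.
  int *devp = sycl::malloc_device<int>(nelem, q);

  const int tag = 0;
  if (rank == 0) {
    std::vector<int> host(nelem, 1);
    q.memcpy(devp, host.data(), nsize).wait();
    // Modify the data on the device before sending.
    q.parallel_for(sycl::range<1>(nelem),
                   [=](sycl::id<1> id) { devp[id] *= 2; })
        .wait();
    MPI_Send(devp, nsize, MPI_BYTE, 1, tag, MPI_COMM_WORLD);
  } else {
    MPI_Status status;
    MPI_Recv(devp, nsize, MPI_BYTE, 0, tag, MPI_COMM_WORLD, &status);
    std::vector<int> host(nelem);
    q.memcpy(host.data(), devp, nsize).wait();
    printf("Rank 1 received %d (expected 2)\n", host[0]);
  }

  sycl::free(devp, q);
  MPI_Finalize();
  return 0;
}
```

Run with two ranks (e.g. `mpirun -n 2`); as the README notes, a non-device-aware MPI will build this fine but may crash at the first MPI call on a device pointer.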
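The README's description of `scatter_reduce_gather` (Scatter the data, reduce each rank's chunk locally with the SYCL 2020 reduction interface, then combine the partial results with Reduce) can likewise be sketched. This is not the repository's implementation: it assumes the element count divides evenly across ranks, uses illustrative names such as `chunk` and `partial`, and omits the final Gather step.

```cpp
// Minimal sketch (not the repository's scatter_reduce_gather.cpp) of the
// Scatter -> local SYCL reduction -> MPI_Reduce idiom with device USM.
#include <mpi.h>
#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);

  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  sycl::queue q{};

  constexpr int N = 1024;        // total elements; assumed divisible by size
  const int local_n = N / size;  // elements handled per rank

  // The root rank initialises the full array in device USM; every rank owns a chunk.
  int *full = nullptr;
  if (rank == 0) {
    full = sycl::malloc_device<int>(N, q);
    std::vector<int> init(N, 1);
    q.memcpy(full, init.data(), N * sizeof(int)).wait();
  }
  int *chunk = sycl::malloc_device<int>(local_n, q);

  // Device-aware MPI lets the collective operate directly on device pointers.
  MPI_Scatter(full, local_n, MPI_INT, chunk, local_n, MPI_INT, 0,
              MPI_COMM_WORLD);

  // Local partial sum on the device via the SYCL 2020 reduction interface.
  int *partial = sycl::malloc_shared<int>(1, q);
  *partial = 0;
  q.submit([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>(local_n),
                    sycl::reduction(partial, sycl::plus<int>()),
                    [=](sycl::id<1> idx, auto &sum) { sum += chunk[idx]; });
   }).wait();

  // Combine the per-rank partial sums into a single scalar on the root rank.
  int res = 0;
  MPI_Reduce(partial, &res, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
  if (rank == 0) printf("res = %d (expected %d)\n", res, N);

  sycl::free(chunk, q);
  sycl::free(partial, q);
  if (rank == 0) sycl::free(full, q);
  MPI_Finalize();
  return 0;
}
```

Here `MPI_Reduce` reads the partial result from shared USM; with a device-aware MPI it could equally be a device USM pointer, which is the situation the README's runtime warning is about.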