diff --git a/src/MPI_with_SYCL/scatter_reduce_gather.cpp b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
index 003de36..a6d8318 100644
--- a/src/MPI_with_SYCL/scatter_reduce_gather.cpp
+++ b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
@@ -44,8 +44,16 @@ int main(int argc, char *argv[]) {
   /* -------------------------------------------------------------------------------------------
      SYCL Initialization, which internally sets the device.
   --------------------------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the usage of `sycl::queue` below shares a single GPU
+  // between the two ranks. To use a separate GPU per rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually construct a
+  // `sycl::context` in this way, with a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use and avoids
+  // misusing the MPI/SYCL interoperability: if the SYCL runtime used the
+  // default `sycl::context`, which contains all available devices, MPI calls
+  // could leak data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   size_t N = 500000;
   std::vector A(N, 1.0);
diff --git a/src/MPI_with_SYCL/send_recv_buff.cpp b/src/MPI_with_SYCL/send_recv_buff.cpp
index fc4ccd0..1f31b80 100644
--- a/src/MPI_with_SYCL/send_recv_buff.cpp
+++ b/src/MPI_with_SYCL/send_recv_buff.cpp
@@ -73,8 +73,16 @@ int main(int argc, char *argv[]) {
   /* ---------------------------------------------------------------------------
      SYCL Initialization, which internally sets the device.
   ----------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the usage of `sycl::queue` below shares a single GPU
+  // between the two ranks. To use a separate GPU per rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually construct a
+  // `sycl::context` in this way, with a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use and avoids
+  // misusing the MPI/SYCL interoperability: if the SYCL runtime used the
+  // default `sycl::context`, which contains all available devices, MPI calls
+  // could leak data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   int tag = 0;
   const int nelem = 20;
diff --git a/src/MPI_with_SYCL/send_recv_usm.cpp b/src/MPI_with_SYCL/send_recv_usm.cpp
index 1487243..dea4343 100644
--- a/src/MPI_with_SYCL/send_recv_usm.cpp
+++ b/src/MPI_with_SYCL/send_recv_usm.cpp
@@ -38,8 +38,16 @@ int main(int argc, char *argv[]) {
   /* -------------------------------------------------------------------------------------------
      SYCL Initialization, which internally sets the device.
   --------------------------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the usage of `sycl::queue` below shares a single GPU
+  // between the two ranks. To use a separate GPU per rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually construct a
+  // `sycl::context` in this way, with a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use and avoids
+  // misusing the MPI/SYCL interoperability: if the SYCL runtime used the
+  // default `sycl::context`, which contains all available devices, MPI calls
+  // could leak data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   int tag = 0;
   const int nelem = 20;
@@ -83,11 +91,10 @@ int main(int argc, char *argv[]) {
     // Copy the data back to the host and wait for the memory copy to complete.
     q.memcpy(&data[0], devp, nsize).wait();
 
-    sycl::free(devp, q);
-
     // Check the values.
     for (int i = 0; i < nelem; ++i) assert(data[i] == -2);
   }
+  sycl::free(devp, q);
   MPI_Finalize();
   return 0;
 }
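The hunks above reference a `Devs` array that is defined earlier in each sample, outside the diff context shown here. As a minimal standalone sketch of the per-rank setup the new comments describe, assuming `Devs` is obtained by querying the SYCL runtime for GPU devices and that each rank can be given its own GPU, the setup might look like this (device discovery and the `Dev` helper variable are illustrative, not taken from the samples):

#include <mpi.h>
#include <sycl/sycl.hpp>
#include <vector>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Hypothetical device discovery: collect every GPU visible to the SYCL
  // runtime. The samples may build their `Devs` array differently, and this
  // assumes at least one GPU is available.
  std::vector<sycl::device> Devs =
      sycl::device::get_devices(sycl::info::device_type::gpu);

  // One device per rank, wrapped in a context that holds only that device,
  // as the comment in the diff recommends. Use `Devs[0]` for both ranks
  // instead to share a single GPU.
  sycl::device Dev = Devs[rank % Devs.size()];
  sycl::context Context(Dev);
  sycl::queue q{Context, Dev};

  // ... device allocations and MPI transfers would use `q` here ...

  MPI_Finalize();
  return 0;
}

Constructing the context from a single device keeps each rank's allocations bound to its own GPU, so an MPI call operating on a device pointer cannot touch memory on a device the rank never uses, which is the leak the diff's comment warns about.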