From 5491714d7fdfcc50dfe23f895ce77c7abd7aa1ba Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Wed, 2 Oct 2024 15:35:50 +0200 Subject: [PATCH 1/2] Purge communicator before free on mpich4+. MPICH from 4.0 on seems to misbehave when a message is still pending while the communicator is being freed. --- src/runtime-libraries/mpi/CMakeLists.txt | 4 ++++ src/runtime-libraries/mpi/mpi_caf.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/runtime-libraries/mpi/CMakeLists.txt b/src/runtime-libraries/mpi/CMakeLists.txt index b79334c1..6341cdf7 100644 --- a/src/runtime-libraries/mpi/CMakeLists.txt +++ b/src/runtime-libraries/mpi/CMakeLists.txt @@ -154,6 +154,10 @@ if (mpi_version_out MATCHES "[Oo]pen[ -][Mm][Pp][Ii]") if(NOT DEFINED ENV{TRAVIS}) message( STATUS "Open-MPI back end detected, passing --allow-run-as-root to allow tests to pass when run with sudo or as root." ) endif() +elseif (mpi_version_out MATCHES "HYDRA") + message(STATUS "MPICH detected") + target_compile_definitions(caf_mpi PRIVATE MPI_CLEAR_COMM_BEFORE_FREE) + target_compile_definitions(caf_mpi_static PRIVATE MPI_CLEAR_COMM_BEFORE_FREE) endif () if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") diff --git a/src/runtime-libraries/mpi/mpi_caf.c b/src/runtime-libraries/mpi/mpi_caf.c index ce63f724..5e5d71d7 100644 --- a/src/runtime-libraries/mpi/mpi_caf.c +++ b/src/runtime-libraries/mpi/mpi_caf.c @@ -1104,6 +1104,24 @@ finalize_internal(int status_code) ierr = MPI_Finalize(); chk_err(ierr); } #else +#ifdef MPI_CLEAR_COMM_BEFORE_FREE + { + int probe_flag; + MPI_Status status; + do { + ierr = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, CAF_COMM_WORLD, &probe_flag, + &status); /* error is not of interest. 
*/ + if (probe_flag) { + int cnt; + MPI_Get_count(&status, MPI_BYTE, &cnt); + void * buf = alloca(cnt); + ierr = MPI_Recv(buf, cnt, MPI_BYTE, status.MPI_SOURCE, status.MPI_TAG, + CAF_COMM_WORLD, &status); chk_err(ierr); + } + } while (probe_flag); + } +#endif + + dprint("freeing caf's communicator.\n"); ierr = MPI_Comm_free(&CAF_COMM_WORLD); chk_err(ierr); CAF_Win_unlock_all(*stat_tok); @@ -1112,6 +1130,7 @@ finalize_internal(int status_code) /* Only call Finalize if CAF runtime Initialized MPI. */ if (caf_owns_mpi) { + dprint("Finalizing MPI.\n"); ierr = MPI_Finalize(); chk_err(ierr); } #endif From 2bae1150a6b593de70c69f13dbda98b5b6e01943 Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Wed, 9 Oct 2024 12:20:51 +0200 Subject: [PATCH 2/2] Use a global property for indicating openmpi. Using just a variable to indicate openmpi did not work reliably with all generators, i.e. at least with ninja the variable was not set in the top-level CMakeLists.txt and therefore additional options were not set leading to test failures. --- CMakeLists.txt | 6 ++++++ src/runtime-libraries/mpi/CMakeLists.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ea5fc68..47fbf4b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -644,6 +644,11 @@ define_property(TARGET FULL_DOCS "Property to mark executable targets run as tests that they require 2^n images." ) +#------------------------------------------------------------------------------------- +# Add global openmpi property, because using a variable and setting it in parent scope did +# not work as expected, i.e., not at all, on Linux Fedora 39. 
+#------------------------------------------------------------------------------------- +define_property(GLOBAL PROPERTY openmpi BRIEF_DOCS "True when mpi is openMPI." FULL_DOCS "Set to true by src/runtime-libraries/mpi/CMakeLists.txt when the MPI launcher is detected as Open MPI; read by add_caf_test to add a --hostfile argument.") #------------------------------- # Recurse into the src directory @@ -715,6 +720,7 @@ function(add_caf_test name num_caf_img test_target) endif() endif() # Add a host file for OMPI + get_property(openmpi GLOBAL PROPERTY openmpi) if ( openmpi ) set(test_parameters --hostfile ${CMAKE_BINARY_DIR}/hostfile) endif() diff --git a/src/runtime-libraries/mpi/CMakeLists.txt b/src/runtime-libraries/mpi/CMakeLists.txt index 6341cdf7..8155b17b 100644 --- a/src/runtime-libraries/mpi/CMakeLists.txt +++ b/src/runtime-libraries/mpi/CMakeLists.txt @@ -147,7 +147,7 @@ execute_process(COMMAND ${MPIEXEC_EXECUTABLE} --version OUTPUT_VARIABLE mpi_version_out) if (mpi_version_out MATCHES "[Oo]pen[ -][Mm][Pp][Ii]") message( STATUS "OpenMPI detected") - set ( openmpi true PARENT_SCOPE) + set_property(GLOBAL PROPERTY openmpi true) # Write out a host file because OMPI's mpiexec is dumb file(WRITE ${CMAKE_BINARY_DIR}/hostfile "${HOST_NAME} slots=${N_CPU}\n") message( STATUS "hostfile written to: ${CMAKE_BINARY_DIR}/hostfile")