Integrate changes from NERSC GPU hackathon. (BlueBrain/CoreNeuron#713)

Summary of changes: - Support OpenMP target offload when NMODL and GPU support are enabled. (BlueBrain/CoreNeuron#693, BlueBrain/CoreNeuron#704, BlueBrain/CoreNeuron#705, BlueBrain/CoreNeuron#707, BlueBrain/CoreNeuron#708, BlueBrain/CoreNeuron#716, BlueBrain/CoreNeuron#719) - Use sensible defaults for the --nwarp parameter, improving the performance of the Hines solver with --cell-permute=2 on GPU. (BlueBrain/CoreNeuron#700, BlueBrain/CoreNeuron#710, BlueBrain/CoreNeuron#718) - Use a Boost memory pool, if Boost is available, to reduce the number of independent CUDA unified memory allocations used for Random123 stream objects. This speeds up initialisation of models using Random123, and also makes it feasible to use NSight Compute on models using Random123 and for NSight Systems to profile initialisation. (BlueBrain/CoreNeuron#702, BlueBrain/CoreNeuron#703) - Use -cuda when compiling with NVHPC and OpenACC or OpenMP, as recommended on the NVIDIA forums. (BlueBrain/CoreNeuron#721) - Do not compile for compute capability 6.0 by default, as this is not supported by NVHPC with OpenMP target offload. - Add new GitLab CI tests so we test CoreNEURON + NMODL with both OpenACC and OpenMP. (BlueBrain/CoreNeuron#698, BlueBrain/CoreNeuron#717) - Add CUDA runtime header search path explicitly, so we don't rely on it being implicit in our NVHPC localrc. - Cleanup unused code. (BlueBrain/CoreNeuron#711) Co-authored-by: Pramod Kumbhar <[email protected]> Co-authored-by: Ioannis Magkanaris <[email protected]> Co-authored-by: Christos Kotsalos <[email protected]> Co-authored-by: Nicolas Cornu <[email protected]> CoreNEURON Repo SHA: BlueBrain/CoreNeuron@423ae6c
neuronsimulator · Dec 23, 2021 · 30e1474 · 30e1474
1 parent 65e3f92
commit 30e1474
Show file tree

Hide file tree

Showing 43 changed files with 1,109 additions and 1,080 deletions.
diff --git a/cmake/coreneuron/CMakeLists.txt b/cmake/coreneuron/CMakeLists.txt
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
 # Build options
 # =============================================================================
 option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
+option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
 option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
 option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
 option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -117,6 +118,7 @@ else()
   set(CORENRN_HAVE_NVHPC_COMPILER OFF)
 endif()
 
+set(CORENRN_ACCELERATOR_OFFLOAD "Disabled")
 if(CORENRN_ENABLE_GPU)
   # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after
   # https://github.com/BlueBrain/CoreNeuron/pull/609.
@@ -135,7 +137,7 @@ if(CORENRN_ENABLE_GPU)
 
   # Set some sensible default CUDA architectures.
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80)
     message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
   endif()
 
@@ -185,6 +187,18 @@ if(CORENRN_ENABLE_GPU)
   set(CMAKE_CUDA_FLAGS
       "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
   )
+
+  if(CORENRN_ENABLE_NMODL)
+    # NMODL supports both OpenACC and OpenMP target offload
+    if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+      set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP")
+    else()
+      set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
+    endif()
+  else()
+    # MOD2C only supports OpenACC offload
+    set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
+  endif()
 endif()
 
 # =============================================================================
@@ -526,6 +540,7 @@ message(STATUS "MOD2CPP PATH        | ${CORENRN_MOD2CPP_BINARY}")
 message(STATUS "GPU Support         | ${CORENRN_ENABLE_GPU}")
 if(CORENRN_ENABLE_GPU)
   message(STATUS "  CUDA              | ${CUDAToolkit_LIBRARY_DIR}")
+  message(STATUS "  Offload           | ${CORENRN_ACCELERATOR_OFFLOAD}")
   message(STATUS "  Unified Memory    | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}")
 endif()
 message(STATUS "Auto Timeout        | ${CORENRN_ENABLE_TIMEOUT}")

diff --git a/cmake/coreneuron/OpenAccHelper.cmake b/cmake/coreneuron/OpenAccHelper.cmake
@@ -10,6 +10,9 @@
 if(CORENRN_ENABLE_GPU)
   # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs
   add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
+  # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and
+  # stopping profiling. This makes sure those headers can be found.
+  include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
   # cuda unified memory support
   if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY)
     add_compile_definitions(CORENEURON_UNIFIED_MEMORY)
@@ -47,25 +50,32 @@ if(CORENRN_ENABLE_GPU)
     endif()
     set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
   endif()
-  # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
-  # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
-  # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
-  # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
-  # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
-  # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
-  # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
-  # CUDA version as is used for the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
-  set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
+  # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
+  # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
+  # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
+  # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
+  # code is compiled with the same CUDA version as the explicit CUDA code.
+  set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
   foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
     string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
   endforeach()
+  if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
+    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+    # for a region then prefer OpenMP.
+    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
+  elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
+    # Only enable OpenACC offload for GPU
+    string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
+  else()
+    message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
+  endif()
   # avoid PGI adding standard compliant "-A" flags
   set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
-  string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
+  string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
   # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
   # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
   # Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
@@ -81,7 +91,7 @@ if(CORENRN_ENABLE_GPU)
     GLOBAL
     PROPERTY
       CORENEURON_LIB_LINK_FLAGS
-      "${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
+      "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
   )
 else()
   set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS

diff --git a/external/nmodl b/external/nmodl
diff --git a/src/coreneuron/CMakeLists.txt b/src/coreneuron/CMakeLists.txt
@@ -284,6 +284,16 @@ target_include_directories(coreneuron SYSTEM
 target_include_directories(coreneuron SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
 
+if(CORENRN_ENABLE_GPU)
+  # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available.
+  find_package(Boost QUIET)
+  if(Boost_FOUND)
+    message(STATUS "Boost found, enabling use of memory pools for Random123...")
+    target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
+    target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL)
+  endif()
+endif()
+
 set_target_properties(
   coreneuron scopmath
   PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib

diff --git a/src/coreneuron/apps/corenrn_parameters.cpp b/src/coreneuron/apps/corenrn_parameters.cpp
@@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() {
                  "Print number of instances of each mechanism and detailed memory stats.");
 
     auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU.");
-    sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true)
+    sub_gpu
+        ->add_option("-W, --nwarp",
+                     this->nwarp,
+                     "Number of warps to execute in parallel the Hines solver. Each warp solves a "
+                     "group of cells. (Only used with cell permute 2)",
+                     true)
         ->check(CLI::Range(0, 1'000'000));
     sub_gpu
         ->add_option("-R, --cell-permute",

diff --git a/src/coreneuron/apps/corenrn_parameters.hpp b/src/coreneuron/apps/corenrn_parameters.hpp
@@ -46,8 +46,8 @@ struct corenrn_parameters_data {
     unsigned ms_subint = 2;                /// Number of multisend interval. 1 or 2
     unsigned spkcompress = 0;              /// Spike Compression
     unsigned cell_interleave_permute = 0;  /// Cell interleaving permutation
-    unsigned nwarp = 0;     /// Number of warps to balance for cell_interleave_permute == 2
-    unsigned num_gpus = 0;  /// Number of gpus to use per node
+    unsigned nwarp = 65536;  /// Number of warps to balance for cell_interleave_permute == 2
+    unsigned num_gpus = 0;   /// Number of gpus to use per node
     unsigned report_buff_size = report_buff_size_default;  /// Size in MB of the report buffer.
     int seed = -1;  /// Initialization seed for random number generator (int)
 

diff --git a/src/coreneuron/apps/main1.cpp b/src/coreneuron/apps/main1.cpp
@@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc,
     // precedence is: set by user, globals.dat, 34.0
     celsius = corenrn_param.celsius;
 
-#if _OPENACC
+#if CORENEURON_ENABLE_GPU
     if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) {
         fprintf(stderr,
-                "compiled with _OPENACC does not allow the combination of --cell-permute=2 and "
+                "compiled with CORENEURON_ENABLE_GPU does not allow the combination of "
+                "--cell-permute=2 and "
                 "missing --gpu\n");
         exit(1);
     }
@@ -499,7 +500,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
     }
 #endif
 
-#ifdef _OPENACC
+#ifdef CORENEURON_ENABLE_GPU
     if (corenrn_param.gpu) {
         init_gpu();
     }
@@ -560,10 +561,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
     bool compute_gpu = corenrn_param.gpu;
 
-    // clang-format off
-
-    #pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
+    nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
     {
         double v = corenrn_param.voltage;
         double dt = corenrn_param.dt;