Skip to content

Commit

Permalink
Integrate changes from NERSC GPU hackathon. (BlueBrain/CoreNeuron#713)
Browse files Browse the repository at this point in the history
Summary of changes:
 - Support OpenMP target offload when NMODL and GPU support are enabled.
   (BlueBrain/CoreNeuron#693, BlueBrain/CoreNeuron#704, BlueBrain/CoreNeuron#705, BlueBrain/CoreNeuron#707, BlueBrain/CoreNeuron#708, BlueBrain/CoreNeuron#716, BlueBrain/CoreNeuron#719)
 - Use sensible defaults for the --nwarp parameter, improving the performance
   of the Hines solver with --cell-permute=2 on GPU. (BlueBrain/CoreNeuron#700, BlueBrain/CoreNeuron#710, BlueBrain/CoreNeuron#718)
 - Use a Boost memory pool, if Boost is available, to reduce the number of
   independent CUDA unified memory allocations used for Random123 stream
   objects. This speeds up initialisation of models using Random123, and also
   makes it feasible to use NSight Compute on models using Random123 and for
   NSight Systems to profile initialisation. (BlueBrain/CoreNeuron#702, BlueBrain/CoreNeuron#703)
 - Use -cuda when compiling with NVHPC and OpenACC or OpenMP, as recommended
   on the NVIDIA forums. (BlueBrain/CoreNeuron#721)
 - Do not compile for compute capability 6.0 by default, as this is not
   supported by NVHPC with OpenMP target offload.
 - Add new GitLab CI tests so we test CoreNEURON + NMODL with both OpenACC and
   OpenMP. (BlueBrain/CoreNeuron#698, BlueBrain/CoreNeuron#717)
 - Add CUDA runtime header search path explicitly, so we don't rely on it being
   implicit in our NVHPC localrc.
 - Cleanup unused code. (BlueBrain/CoreNeuron#711)

Co-authored-by: Pramod Kumbhar <[email protected]>
Co-authored-by: Ioannis Magkanaris <[email protected]>
Co-authored-by: Christos Kotsalos <[email protected]>
Co-authored-by: Nicolas Cornu <[email protected]>

CoreNEURON Repo SHA: BlueBrain/CoreNeuron@423ae6c
  • Loading branch information
olupton authored Dec 23, 2021
1 parent 65e3f92 commit 30e1474
Show file tree
Hide file tree
Showing 43 changed files with 1,109 additions and 1,080 deletions.
17 changes: 16 additions & 1 deletion cmake/coreneuron/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
# Build options
# =============================================================================
option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
Expand Down Expand Up @@ -117,6 +118,7 @@ else()
set(CORENRN_HAVE_NVHPC_COMPILER OFF)
endif()

set(CORENRN_ACCELERATOR_OFFLOAD "Disabled")
if(CORENRN_ENABLE_GPU)
# Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after
# https://github.com/BlueBrain/CoreNeuron/pull/609.
Expand All @@ -135,7 +137,7 @@ if(CORENRN_ENABLE_GPU)

# Set some sensible default CUDA architectures.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
set(CMAKE_CUDA_ARCHITECTURES 70 80)
message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
endif()

Expand Down Expand Up @@ -185,6 +187,18 @@ if(CORENRN_ENABLE_GPU)
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
)

if(CORENRN_ENABLE_NMODL)
# NMODL supports both OpenACC and OpenMP target offload
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP")
else()
set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
endif()
else()
# MOD2C only supports OpenACC offload
set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
endif()
endif()

# =============================================================================
Expand Down Expand Up @@ -526,6 +540,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}")
message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}")
if(CORENRN_ENABLE_GPU)
message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}")
message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}")
message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}")
endif()
message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}")
Expand Down
34 changes: 22 additions & 12 deletions cmake/coreneuron/OpenAccHelper.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
if(CORENRN_ENABLE_GPU)
# Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs
add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
# Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and
# stopping profiling. This makes sure those headers can be found.
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# cuda unified memory support
if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY)
add_compile_definitions(CORENEURON_UNIFIED_MEMORY)
Expand Down Expand Up @@ -47,25 +50,32 @@ if(CORENRN_ENABLE_GPU)
endif()
set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
endif()
# -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
# required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
# device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
# apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
# compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
# CUDA version as is used for the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
# code is compiled with the same CUDA version as the explicit CUDA code.
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
# same default compute capabilities as each other, particularly on GPU-less build machines.
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
endforeach()
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
# for a region then prefer OpenMP.
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
# Only enable OpenACC offload for GPU
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
else()
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
endif()
# avoid PGI adding standard compliant "-A" flags
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
Expand All @@ -81,7 +91,7 @@ if(CORENRN_ENABLE_GPU)
GLOBAL
PROPERTY
CORENEURON_LIB_LINK_FLAGS
"${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
"${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
)
else()
set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS
Expand Down
2 changes: 1 addition & 1 deletion external/nmodl
Submodule nmodl updated from 85dec3 to 46f8ba
10 changes: 10 additions & 0 deletions src/coreneuron/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,16 @@ target_include_directories(coreneuron SYSTEM
target_include_directories(coreneuron SYSTEM
PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)

if(CORENRN_ENABLE_GPU)
# nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available.
find_package(Boost QUIET)
if(Boost_FOUND)
message(STATUS "Boost found, enabling use of memory pools for Random123...")
target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL)
endif()
endif()

set_target_properties(
coreneuron scopmath
PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
Expand Down
7 changes: 6 additions & 1 deletion src/coreneuron/apps/corenrn_parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() {
"Print number of instances of each mechanism and detailed memory stats.");

auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU.");
sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true)
sub_gpu
->add_option("-W, --nwarp",
this->nwarp,
"Number of warps to execute in parallel the Hines solver. Each warp solves a "
"group of cells. (Only used with cell permute 2)",
true)
->check(CLI::Range(0, 1'000'000));
sub_gpu
->add_option("-R, --cell-permute",
Expand Down
4 changes: 2 additions & 2 deletions src/coreneuron/apps/corenrn_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ struct corenrn_parameters_data {
unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2
unsigned spkcompress = 0; /// Spike Compression
unsigned cell_interleave_permute = 0; /// Cell interleaving permutation
unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2
unsigned num_gpus = 0; /// Number of gpus to use per node
unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2
unsigned num_gpus = 0; /// Number of gpus to use per node
unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer.
int seed = -1; /// Initialization seed for random number generator (int)

Expand Down
13 changes: 6 additions & 7 deletions src/coreneuron/apps/main1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc,
// precedence is: set by user, globals.dat, 34.0
celsius = corenrn_param.celsius;

#if _OPENACC
#if CORENEURON_ENABLE_GPU
if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) {
fprintf(stderr,
"compiled with _OPENACC does not allow the combination of --cell-permute=2 and "
"compiled with CORENEURON_ENABLE_GPU does not allow the combination of "
"--cell-permute=2 and "
"missing --gpu\n");
exit(1);
}
Expand Down Expand Up @@ -499,7 +500,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
}
#endif

#ifdef _OPENACC
#ifdef CORENEURON_ENABLE_GPU
if (corenrn_param.gpu) {
init_gpu();
}
Expand Down Expand Up @@ -560,10 +561,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
#endif
bool compute_gpu = corenrn_param.gpu;

// clang-format off

#pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
// clang-format on
nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
{
double v = corenrn_param.voltage;
double dt = corenrn_param.dt;
Expand Down
Loading

0 comments on commit 30e1474

Please sign in to comment.