diff --git a/.gitignore b/.gitignore index 9dbc07673..fd03d2e32 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ *.orig __pycache__/ view +*.cache* diff --git a/CMakeLists.txt b/CMakeLists.txt index 6519366b4..546d74076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,16 +24,17 @@ set(ENABLE_MPI ON CACHE BOOL "") if (ENABLE_CUDA) - add_compile_definitions(USE_CUDA) - set(SERAC_USE_CUDA ON CACHE BOOL "") + #add_compile_definitions(USE_CUDA) + set(ENABLE_CLANG_CUDA ON CACHE BOOL "") set(SERAC_USE_CUDA ON CACHE BOOL "") set(RAJA_ENABLE_CUDA ON CACHE BOOL "") set(AXOM_USE_CUDA ON CACHE BOOL "") + set(MFEM_USE_CUDA ON CACHE BOOL "") endif() if (NOT MPI_C_COMPILER OR NOT MPI_CXX_COMPILER) - message(FATAL_ERROR + message(FATAL_ERROR "Serac requires MPI. It is required to provide the MPI C/C++ " "compiler wrappers via the CMake variables, " "MPI_C_COMPILER and MPI_CXX_COMPILER.") @@ -141,6 +142,10 @@ include(${PROJECT_SOURCE_DIR}/cmake/thirdparty/SetupSeracThirdParty.cmake) set(CMAKE_C_FLAGS ${SERAC_CMAKE_C_FLAGS}) set(CMAKE_CXX_FLAGS ${SERAC_CMAKE_CXX_FLAGS}) +string(REPLACE " -Werror" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -gdwarf-4 -fgpu-rdc" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_DEBUG "-fopenmp -gdwarf-4 -fgpu-rdc" CACHE STRING "") +#set(CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS ON CACHE BOOL "") include(${PROJECT_SOURCE_DIR}/cmake/SeracConfigHeader.cmake) @@ -206,7 +211,7 @@ if (SERAC_ENABLE_CODEVELOP) ) endif() -install(EXPORT serac-targets +install(EXPORT serac-targets NAMESPACE serac:: DESTINATION lib/cmake ) diff --git a/cmake/thirdparty/SetupSeracThirdParty.cmake b/cmake/thirdparty/SetupSeracThirdParty.cmake index 29aaa3a6a..59c9fca9f 100644 --- a/cmake/thirdparty/SetupSeracThirdParty.cmake +++ b/cmake/thirdparty/SetupSeracThirdParty.cmake @@ -15,7 +15,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Manually set includes as system includes foreach(_target cuda_runtime cuda) get_target_property(_dirs ${_target} INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET ${_target} + set_property(TARGET ${_target} APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") endforeach() @@ -59,7 +59,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #------------------------------------------------------------------------------ if(UMPIRE_DIR) serac_assert_is_directory(VARIABLE_NAME UMPIRE_DIR) - find_package(umpire REQUIRED NO_DEFAULT_PATH + find_package(umpire REQUIRED NO_DEFAULT_PATH PATHS ${UMPIRE_DIR}) message(STATUS "Umpire support is ON") set(UMPIRE_FOUND TRUE) @@ -73,7 +73,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #------------------------------------------------------------------------------ if(RAJA_DIR) serac_assert_is_directory(VARIABLE_NAME RAJA_DIR) - find_package(RAJA REQUIRED NO_DEFAULT_PATH + find_package(RAJA REQUIRED NO_DEFAULT_PATH PATHS ${RAJA_DIR}) message(STATUS "RAJA support is ON") set(RAJA_FOUND TRUE) @@ -100,7 +100,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Manually set includes as system includes get_target_property(_dirs conduit::conduit INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET conduit::conduit + set_property(TARGET conduit::conduit APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") @@ -154,10 +154,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) message(FATAL_ERROR "Serac+Caliper+CUDA requires CMake > 3.17.") else() find_package(CUDAToolkit REQUIRED) - endif() + endif() endif() - find_package(caliper REQUIRED NO_DEFAULT_PATH + find_package(caliper REQUIRED 
NO_DEFAULT_PATH PATHS ${CALIPER_DIR}) message(STATUS "Caliper support is ON") set(CALIPER_FOUND TRUE) @@ -177,7 +177,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #### Store Data that MFEM clears set(tpls_to_save AMGX AXOM CALIPER CAMP CONDUIT HDF5 - HYPRE LUA METIS MFEM NETCDF PARMETIS PETSC RAJA + HYPRE LUA METIS MFEM NETCDF PARMETIS PETSC RAJA SUPERLU_DIST STRUMPACK SUNDIALS TRIBOL UMPIRE) foreach(_tpl ${tpls_to_save}) set(${_tpl}_DIR_SAVE "${${_tpl}_DIR}") @@ -271,7 +271,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) else() add_subdirectory(${PROJECT_SOURCE_DIR}/mfem ${CMAKE_BINARY_DIR}/mfem) endif() - + set(MFEM_FOUND TRUE CACHE BOOL "" FORCE) # Temporary hack to inject the hdf5_hl after netcdf and before hdf5 @@ -304,7 +304,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) serac_assert_is_directory(VARIABLE_NAME AXOM_DIR) find_package(axom REQUIRED - NO_DEFAULT_PATH + NO_DEFAULT_PATH PATHS ${AXOM_DIR}/lib/cmake) message(STATUS "Axom support is ON") @@ -420,10 +420,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Mark the axom includes as "system" and filter unallowed directories get_target_property(_dirs core INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET core + set_property(TARGET core PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${_dirs}") - set_property(TARGET core + set_property(TARGET core APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") else() @@ -432,10 +432,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Mark the axom includes as "system" and filter unallowed directories get_target_property(_dirs axom INTERFACE_INCLUDE_DIRECTORIES) list(REMOVE_ITEM _dirs ${PROJECT_SOURCE_DIR}) - set_property(TARGET axom + set_property(TARGET axom PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${_dirs}") - set_property(TARGET axom + set_property(TARGET axom APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") endif() @@ -449,7 +449,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) serac_assert_is_directory(VARIABLE_NAME TRIBOL_DIR) find_package(tribol REQUIRED - NO_DEFAULT_PATH + NO_DEFAULT_PATH PATHS ${TRIBOL_DIR}/lib/cmake) if(TARGET tribol) @@ -468,7 +468,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) else() set(TRIBOL_FOUND OFF) endif() - + message(STATUS "Tribol support is " ${TRIBOL_FOUND}) #------------------------------------------------------------------------------ @@ -498,7 +498,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # This flag is empty due to us not enabling fortran but we need to strip it # so it doesn't propagate in our project if("${OpenMP_Fortran_FLAGS}" STREQUAL "") - set(OpenMP_Fortran_FLAGS "$<$>:-fopenmp=libomp>;$<$:-fopenmp>") + set(OpenMP_Fortran_FLAGS "-fopenmp")#"$<$>:-fopenmp=libomp>;$<$:-fopenmp>") endif() foreach(_target axom) diff --git a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake index 83876a455..23b37f2cf 100644 --- a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake +++ b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake @@ -24,13 +24,15 @@ if(DEFINED ENV{SPACK_CC}) else() set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-ibm-10.0.1-gcc-8.3.1/bin/clang" CACHE PATH "") + #set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-13.0.1-gcc-8.3.1/bin/clang" CACHE PATH "") set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-ibm-10.0.1-gcc-8.3.1/bin/clang++" CACHE PATH "") + #set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-13.0.1-gcc-8.3.1/bin/clang++" CACHE PATH "") set(CMAKE_Fortran_COMPILER 
"/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran" CACHE PATH "") endif() - +#-fopenmp -gdwarf-4 -fgpu-rdc set(CMAKE_C_STANDARD_LIBRARIES "-lgfortran" CACHE STRING "") set(CMAKE_CXX_STANDARD_LIBRARIES "-lgfortran" CACHE STRING "") @@ -65,13 +67,15 @@ set(BLT_MPI_COMMAND_APPEND "mpibind" CACHE STRING "") # Cuda #------------------------------------------------ -set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-11.2.0" CACHE PATH "") +set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-12.0.0" CACHE PATH "") +#set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-10.1.105" CACHE PATH "") set(CMAKE_CUDA_COMPILER "${CUDAToolkit_ROOT}/bin/nvcc" CACHE PATH "") set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "") -set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-11.2.0" CACHE PATH "") +set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-12.0.0" CACHE PATH "") +#set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-10.1.105" CACHE PATH "") set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "") @@ -79,15 +83,18 @@ set(ENABLE_OPENMP ON CACHE BOOL "") set(ENABLE_CUDA ON CACHE BOOL "") -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "") +set(ENABLE_CLANG_CUDA OFF CACHE BOOL "") set(CMAKE_CUDA_FLAGS " --expt-extended-lambda --expt-relaxed-constexpr " CACHE STRING "") -set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "") +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "") + +#set(CMAKE_CUDA_FLAGS " --expt-extended-lambda --expt-relaxed-constexpr " CACHE STRING "") +#set(CMAKE_CUDA_FLAGS "-fopenmp" CACHE STRING "") # nvcc does not like gtest's 'pthreads' flag -set(gtest_disable_pthreads ON CACHE BOOL "") +set(gtest_disable_pthreads OFF CACHE BOOL "") set(BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3" CACHE STRING "") diff --git a/src/serac/numerics/functional/boundary_integral_kernels.hpp b/src/serac/numerics/functional/boundary_integral_kernels.hpp index f2da655f2..6f5373561 100644 --- a/src/serac/numerics/functional/boundary_integral_kernels.hpp +++ b/src/serac/numerics/functional/boundary_integral_kernels.hpp @@ -182,41 +182,41 @@ void evaluation_kernel_impl(trial_element_type trial_elements, test_element, con #if defined(USE_CUDA) std::cout << "USING CUDA\n"; - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif // for each element in the domain - RAJA::forall( - RAJA::TypedRangeSegment(0, num_elements), - [J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE(uint32_t e) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) - (void)qf_derivatives; - (void)qpts_per_elem; - - static constexpr trial_element_type empty_trial_element{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(empty_trial_element).interpolate(get(u)[e], rule))...}; - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get(qf_inputs)...); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't 
need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[e]); - }); + // RAJA::forall( + // RAJA::TypedRangeSegment(0, num_elements), + // [J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE(uint32_t e) { + // // load the jacobians and positions for each quadrature point in this element + // auto J_e = J[e]; + // auto x_e = x[e]; + // // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) + // (void)qf_derivatives; + // (void)qpts_per_elem; + // + // static constexpr trial_element_type empty_trial_element{}; + // // batch-calculate values / derivatives of each trial space, at each quadrature point + // [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( + // get(empty_trial_element).interpolate(get(u)[e], rule))...}; + // + // // (batch) evalute the q-function at each quadrature point + // auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get(qf_inputs)...); + // + // // write out the q-function derivatives after applying the + // // physical_to_parent transformation, so that those transformations + // // won't need to be applied in the action_of_gradient and element_gradient kernels + // if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + // for (int q = 0; q < leading_dimension(qf_outputs); q++) { + // qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); + // } + // } + // + // // (batch) integrate the material response against the test-space basis functions + // test_element::integrate(get_value(qf_outputs), rule, &r[e]); + // }); } //clang-format off @@ -278,7 +278,7 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q constexpr TensorProductQuadratureRule rule{}; #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif @@ -334,7 +334,7 @@ void element_gradient_kernel(ExecArrayView dK, #if defined(USE_CUDA) std::cout << "USING CUDA :)\n"; - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif diff --git a/src/serac/numerics/functional/detail/hexahedron_H1.inl b/src/serac/numerics/functional/detail/hexahedron_H1.inl index ace6dc2ef..fbc8920f8 100644 --- a/src/serac/numerics/functional/detail/hexahedron_H1.inl +++ b/src/serac/numerics/functional/detail/hexahedron_H1.inl @@ -16,8 +16,9 @@ // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond +#include template -struct finite_element > { +struct finite_element> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -31,9 +32,9 @@ struct finite_element > { using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; SERAC_HOST_DEVICE static constexpr 
tensor shape_functions(tensor xi) @@ -72,7 +73,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -105,6 +106,28 @@ struct finite_element > { } return B; } +#ifdef USE_CUDA + // Compute B on device, and return a pointer to device memory. + template + static serac::tensor* calculate_B_device() + { + constexpr auto points1D = GaussLegendreNodes(); + [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights(); + tensor* B_ptr = nullptr; + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + B_ptr = static_cast*>(dest_allocator.allocate(sizeof(tensor))); + using policy = RAJA::cuda_exec<256>; + RAJA::forall(RAJA::TypedRangeSegment(0, q), [points1D, weights1D, B_ptr] SERAC_HOST_DEVICE(int i) { + (*B_ptr)[i] = GaussLobattoInterpolation(points1D[i]); + if constexpr (apply_weights) { + (*B_ptr)[i] = (*B_ptr)[i] * weights1D[i]; + } + }); + + return B_ptr; + } +#endif /** * @brief G(i,j) is the derivative of the @@ -129,6 +152,27 @@ struct finite_element > { return G; } +#ifdef USE_CUDA + template + static serac::tensor* calculate_G_device() + { + constexpr auto points1D = GaussLegendreNodes(); + [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights(); + tensor* G_ptr = nullptr; + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + G_ptr = static_cast*>(dest_allocator.allocate(sizeof(tensor))); + using policy = RAJA::cuda_exec<256>; + RAJA::forall(RAJA::TypedRangeSegment(0, q), [points1D, weights1D, G_ptr] SERAC_HOST_DEVICE(int i) { + (*G_ptr)[i] = GaussLobattoInterpolationDerivative(points1D[i]); + if constexpr (apply_weights) { + (*G_ptr)[i] = (*G_ptr)[i] * weights1D[i]; + } + }); + return G_ptr; + } +#endif + template static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) @@ -168,7 +212,9 @@ struct finite_element > { } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* foo = nullptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) { // we want to compute the following: // @@ -186,11 +232,20 @@ struct finite_element > { // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx) // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx) static constexpr bool apply_weights = false; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + + RAJA::RangeSegment x_range(0, BLOCK_SZ); + +#if not defined(USE_CUDA) tensor value{}; tensor gradient{}; + static constexpr auto B = calculate_B(); + static constexpr auto G = calculate_G(); for (int i = 0; i < c; i++) { auto A10 = contract<2, 1>(X[i], B); @@ -205,33 +260,112 @@ struct finite_element > { gradient(i, 1) = contract<0, 1>(A22, B); gradient(i, 2) = contract<0, 1>(A20, G); } +#else + + RAJA_TEAM_SHARED tensor value; + RAJA_TEAM_SHARED tensor gradient; + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + for (int i = 0; i < c; i++) { + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], G)) 
A11; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, B)) A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A11, B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, G)) A22; + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + + // Perform actual contractions + contract<2, 1>(X[i], B, &A10, qx, qy, qz); + + ctx.teamSync(); + contract<2, 1>(X[i], G, &A11, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A10, B, &A20, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A11, B, &A21, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A10, G, &A22, qx, qy, qz); + + ctx.teamSync(); + + contract<0, 1>(A20, B, &value(i), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A21, B, &gradient(i, 0), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A22, B, &gradient(i, 1), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A20, G, &gradient(i, 2), qx, qy, qz); + + ctx.teamSync(); + }); + } +#endif // transpose the quadrature data into a flat tensor of tuples - union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + + RAJA_TEAM_SHARED union { + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - for (int i = 0; i < c; i++) { - get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); - for (int j = 0; j < dim; j++) { - get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + if (qx >= q || qy >= q || qz >= q) { + return; + } + for (int i = 0; i < c; i++) { + get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); + for (int j = 0; j < dim; j++) { + get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); } } + }); + if (foo) { + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + (*foo)[tid] = output.one_dimensional[tid]; + } + }); } - return output.one_dimensional; } + template + static void copy_tensor_to_host(serac::tensor* src, DestinationTensor* dst) + { + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + + umpire::register_external_allocation( + dst, + umpire::util::AllocationRecord(dst, sizeof(DestinationTensor), rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + + rm.copy(dst, src); + } + + template + static void copy_tensor_to_host(serac::zero* src, DestinationTensor* dst) + { + *dst = serac::zero{}; + } + template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext ctx = RAJA::LaunchContext{}, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -239,39 +373,88 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; - static constexpr bool apply_weights = true; - static constexpr auto B = calculate_B(); - static constexpr auto G 
= calculate_G(); + /*static*/ constexpr bool apply_weights = true; + + RAJA::RangeSegment x_range(0, BLOCK_SZ); + +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + + using threads_x = RAJA::LoopPolicy; +#endif for (int j = 0; j < ntrial; j++) { for (int i = 0; i < c; i++) { s_buffer_type source; f_buffer_type flux; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - int Q = (qz * q + qy) * q + qx; - source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; - for (int k = 0; k < dim; k++) { - flux(k, qz, qy, qx) = - reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + if (qx >= q || qy >= q || qz >= q) { + return; } - } - - auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); - auto A21 = contract<2, 0>(flux(1), B); - auto A22 = contract<2, 0>(flux(2), B); + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + int Q = (qz * q + qy) * q + qx; + source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; + for (int k = 0; k < dim; k++) { + flux(k, qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; + } + }); +#if not defined USE_CUDA + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); + auto A21 = contract<2, 0>(flux(1), B); + auto A22 = contract<2, 0>(flux(2), B); auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G); auto A11 = contract<1, 0>(A22, B); element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G); +#else + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(source, B)) A20, A20_tmp; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(1), B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(2), B)) A22; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A20, B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A22, B)) A11; + ctx.teamSync(); + + contract<2, 0>(source, B, &A20, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(0), G, &A20, qx, qy, qz, true); + ctx.teamSync(); + + contract<2, 0>(flux(1), B, &A21, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(2), B, &A22, qx, qy, qz); + ctx.teamSync(); + + contract<1, 0>(A21, G, &A10, qx, qy, qz); + ctx.teamSync(); + contract<1, 0>(A20, B, &A10, qx, qy, qz, true); + ctx.teamSync(); + contract<1, 0>(A22, B, &A11, qx, qy, qz); + ctx.teamSync(); + + contract<0, 0>(A10, B, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + contract<0, 0>(A11, G, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + }); +#endif } } } diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp index becebf9d3..6ac6c5750 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.hpp +++ b/src/serac/numerics/functional/domain_integral_kernels.hpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) #pragma once +#include #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/quadrature_data.hpp" 
#include "serac/numerics/functional/function_signature.hpp" @@ -12,45 +13,31 @@ #include #include +#include +#include #include #include #include #ifdef USE_CUDA #include namespace { - void printCUDAMemUsage() { - int deviceCount = 0; - cudaGetDeviceCount(&deviceCount); - for (int i = 0; i < deviceCount; ++i) { - cudaSetDevice(i); - - size_t freeBytes, totalBytes; - cudaMemGetInfo(&freeBytes, &totalBytes); - size_t usedBytes = totalBytes - freeBytes; - - std::cout << "Device Number: " << i << std::endl; - std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; - } - } - -template -DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +void printCUDAMemUsage() { - auto& rm = umpire::ResourceManager::getInstance(); - auto dest_allocator = rm.getAllocator(destination); - - DataType* dest_data = static_cast(dest_allocator.allocate(size * sizeof(DataType))); - - // _sphinx_tag_tut_copy_start - rm.copy(dest_data, source_data); - // _sphinx_tag_tut_copy_end - - return dest_data; + int deviceCount = 0; + cudaGetDeviceCount(&deviceCount); + int i = 0; + cudaSetDevice(i); + + size_t freeBytes, totalBytes; + cudaMemGetInfo(&freeBytes, &totalBytes); + size_t usedBytes = totalBytes - freeBytes; + + std::cout << "Device Number: " << i << std::endl; + std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; } - template void deallocate(DataType* data, const std::string& destination) { @@ -59,7 +46,7 @@ void deallocate(DataType* data, const std::string& destination) dest_allocator.deallocate(data); } -} +} // namespace #endif namespace serac { @@ -77,37 +64,37 @@ SERAC_HOST_DEVICE struct QFunctionArgument; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple, tensor >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple, tensor >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<2> > { +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<2>> { using type = tuple, double>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<3> > { - using type = tuple, tensor >; ///< what will be passed to the 
q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<3>> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @brief layer of indirection needed to unpack the entries of the argument tuple @@ -143,32 +130,45 @@ SERAC_HOST_DEVICE auto apply_qf(lambda&& qf, coords_type&& x_q, qpt_data_type&& template auto get_derivative_type(lambda qf, qpt_data_type&& qpt_data) { - using qf_arguments = serac::tuple >::type...>; + using qf_arguments = serac::tuple>::type...>; return get_gradient(apply_qf(qf, tensor{}, qpt_data, make_dual_wrt(qf_arguments{}))); }; template -SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(lambda qf, const tensor x, const T&... inputs) +SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(lambda qf, const tensor x, RAJA::LaunchContext ctx, + const T&... inputs) { using return_type = decltype(qf(tensor{}, T{}[0]...)); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment x_range(0, n); tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::loop(ctx, x_range, [&](int i) { tensor x_q; for (int j = 0; j < dim; j++) { x_q[j] = x(j, i); } outputs[i] = qf(x_q, inputs[i]...); - } + }); return outputs; } template SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor x, qpt_data_type* qpt_data, - bool update_state, const T&... inputs) + bool update_state, RAJA::LaunchContext ctx, const T&... inputs) { using return_type = decltype(qf(tensor{}, qpt_data[0], T{}[0]...)); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment x_range(0, n); tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::loop(ctx, x_range, [&](int i) { tensor x_q; for (int j = 0; j < dim; j++) { x_q[j] = x(j, i); @@ -179,7 +179,7 @@ SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor x, if (update_state) { qpt_data[i] = qdata; } - } + }); return outputs; } @@ -195,8 +195,8 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, te { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - using X_Type = typename batched_position::type; - using J_Type = typename batched_jacobian::type; + using X_Type = typename batched_position::type; + using J_Type = typename batched_jacobian::type; auto r = reinterpret_cast(outputs); auto x = const_cast(reinterpret_cast(positions)); auto J = const_cast(reinterpret_cast(jacobians)); @@ -206,95 +206,150 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, te [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; -#define USE_CUDA + + trial_element_tuple_type empty_trial_element{}; + using interpolate_out_type = + decltype(tuple{get(empty_trial_element).interpolate(get(u)[0], rule)...}); + + using type = decltype(tuple{promote_each_to_dual_when( + get(empty_trial_element).interpolate(get(u)[0], rule))...}); + #ifdef USE_CUDA + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + type* qf_inputs = static_cast(dest_allocator.allocate(sizeof(type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(dest_allocator.allocate(sizeof(interpolate_out_type) * num_elements)); std::cout << "USING CUDA :)\n"; - printCUDAMemUsage(); - cudaSetDevice(0); - using policy = RAJA::cuda_exec<512>; - auto& rm = umpire::ResourceManager::getInstance(); - auto dest_allocator = rm.getAllocator("DEVICE"); - auto 
device_J = copy_data((J), serac::size(J), "DEVICE"); - auto device_x = copy_data((x), serac::size(x), "DEVICE"); - auto device_r = copy_data(r, serac::size(r), "DEVICE"); + auto device_J = copy_data(J, serac::size(*J) * sizeof(double), "DEVICE"); + auto device_x = copy_data(x, serac::size(*x) * sizeof(double), "DEVICE"); + auto device_r = copy_data(r, serac::size(*r) * sizeof(double), "DEVICE"); // These more complex types require a helper struct to deduce the data structure size. - decltype(u)* device_u = static_cast(dest_allocator.allocate(serac::size(u))); - rm.copy(device_u, &u); - auto device_qf_derivatives = static_cast(dest_allocator.allocate(serac::size(*qf_derivatives))); - rm.copy(device_qf_derivatives, qf_derivatives); + // decltype(u)* device_u = static_cast(dest_allocator.allocate(serac::size(u))); + + // vector of pointers to device inputs + + // umpire::register_external_allocation( + // &u, umpire::util::AllocationRecord(&u, serac::size(u) * sizeof(double), + // rm.getAllocator("HOST").getAllocationStrategy(), + // std::string("external array"))); + // rm.copy(device_u, &u); + // auto device_qf_derivatives = static_cast(dest_allocator.allocate(serac::size(*qf_derivatives) * + // sizeof(double))); umpire::register_external_allocation( + // qf_derivatives, umpire::util::AllocationRecord(qf_derivatives, serac::size(*qf_derivatives) * sizeof(double), + // rm.getAllocator("HOST").getAllocationStrategy(), + // std::string("external array"))); + // rm.copy(device_qf_derivatives, qf_derivatives); + + printCUDAMemUsage(); + cudaSetDevice(0); #else - using policy = RAJA::simd_exec; + type* qf_inputs = nullptr; + interpolate_out_type* interpolate_result = nullptr; + auto device_J = J; + auto device_x = x; + auto device_r = r; + // auto device_qf_derivatives = qf_derivatives; #endif + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + +#if defined(USE_CUDA) + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; +#else + using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; +#endif + std::cout << "NUM elements " << num_elements << std::endl; // for each element in the domain - RAJA::forall( - RAJA::TypedRangeSegment(0, num_elements), - [device_J, device_x, qf, device_u, qpts_per_elem, rule, device_r, qf_state, device_qf_derivatives, update_state] SERAC_HOST_DEVICE(uint32_t e) { - auto J_e = device_J[e]; - auto x_e = device_x[e]; - // load the jacobians and positions for each quadrature point in this element - - // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) - //(void)u; - (void)device_qf_derivatives; - (void)qpts_per_elem; - (void)update_state; - (void)qf_state; - - static constexpr trial_element_tuple_type empty_trial_element{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(empty_trial_element).interpolate(get(*device_u)[e], rule))...}; - - // use J_e to transform values / derivatives on the parent element - // to the to the corresponding values / derivatives on the physical element - (parent_to_physical(empty_trial_element).family>(get(qf_inputs), J_e), ...); - - // (batch) evalute the q-function at each quadrature point - // - // note: the weird immediately-invoked lambda expression is - // a workaround for a bug in GCC(<12.0) where it fails to - // decide which function overload to use, and crashes - - auto qf_outputs = [&]() { - if constexpr 
(std::is_same_v) { - return batch_apply_qf_no_qdata(qf, x_e, get(qf_inputs)...); - } else { - return batch_apply_qf(qf, x_e, &qf_state(e, 0), update_state, get(qf_inputs)...); - } - }(); - - // use J to transform sources / fluxes on the physical element - // back to the corresponding sources / fluxes on the parent element - physical_to_parent(qf_outputs, J_e); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - device_qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, e_range, + [&ctx, device_J, device_x, u, qf, qpts_per_elem, rule, device_r, qf_state, + /*device_*/ qf_derivatives, qf_inputs, interpolate_result, update_state](uint32_t e) { + if constexpr (test_element_type::geometry == mfem::Geometry::CUBE) { + // load the jacobians and positions for each quadrature point in this element + static constexpr trial_element_tuple_type empty_trial_element{}; + + // batch-calculate values / derivatives of each trial space, at each quadrature point + + (get(empty_trial_element) + .interpolate(get(u)[e], rule, &get(interpolate_result[e]), ctx), + ...); + + (promote_each_to_dual_when( + get(interpolate_result[e]), &get(qf_inputs[e]), ctx), + ...); + + //// use J_e to transform values / derivatives on the parent element + //// to the to the corresponding values / derivatives on the physical element + (parent_to_physical(empty_trial_element).family>( + get(qf_inputs[e]), device_J, e, ctx), + ...); + // parent_to_physical(empty_trial_element).family>(qf_inputs, device_J,e, ctx); + + ctx.teamSync(); + //// (batch) evalute the q-function at each quadrature point + //// + //// note: the weird immediately-invoked lambda expression is + //// a workaround for a bug in GCC(<12.0) where it fails to + //// decide which function overload to use, and crashes + // + auto qf_outputs = [&]() { + if constexpr (std::is_same_v) { + return batch_apply_qf_no_qdata(qf, device_x[e], ctx, get(qf_inputs[e])...); + } else { + return batch_apply_qf(qf, device_x[e], &qf_state(e, 0), update_state, ctx, + get(qf_inputs[e])...); + } + }(); - // (batch) integrate the material response against the test-space basis functions - test_element_type::integrate(get_value(qf_outputs), rule, &device_r[e]); - +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + // use J to transform sources / fluxes on the physical element + // back to the corresponding sources / fluxes on the parent element + // physical_to_parent(qf_outputs, J_e); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs)); + RAJA::loop(ctx, x_range, [&](int q) { + // qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = + // get_gradient(qf_outputs[q]); + }); + } + ctx.teamSync(); + + // (batch) integrate the material response 
against the test-space basis functions + test_element_type::integrate(get_value(qf_outputs), rule, &device_r[e], ctx); + } + }); }); - - rm.copy(&r, device_r); - - rm.copy(&u, device_u); - rm.copy(qf_derivatives, device_qf_derivatives); - - deallocate(device_J, "DEVICE"); - deallocate(device_x, "DEVICE"); - deallocate(device_u, "DEVICE"); - deallocate(device_r, "DEVICE"); - deallocate(device_qf_derivatives, "DEVICE"); +#ifdef USE_CUDA + rm.copy(r, device_r); + // rm.copy(&u, device_u); + // rm.copy(qf_derivatives, device_qf_derivatives); + deallocate(device_J, "DEVICE"); + deallocate(qf_inputs, "DEVICE"); + deallocate(interpolate_result, "DEVICE"); + deallocate(device_x, "DEVICE"); + deallocate(device_r, "DEVICE"); + std::cout << "312\n"; + printCUDAMemUsage(); + // deallocate(device_u, "DEVICE"); + // deallocate(device_qf_derivatives, "DEVICE"); +#endif return; } @@ -317,7 +372,8 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) //clang-format on template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +SERAC_HOST_DEVICE tensor(derivative_type{}, T{})), n> batch_apply_chain_rule( + derivative_type* qf_derivatives, const tensor& inputs, const RAJA::LaunchContext& ctx = RAJA::LaunchContext{}) { using return_type = decltype(chain_rule(derivative_type{}, T{})); tensor outputs{}; @@ -368,23 +424,34 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q auto dr = reinterpret_cast(dR); constexpr TensorProductQuadratureRule rule{}; + const int n_blocks_x = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_X); + const int n_blocks_y = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_Y); + const int n_blocks_z = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_Z); + auto e_range = RAJA::RangeSegment(0, num_elements); + #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; #else - using policy = RAJA::simd_exec; + using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; #endif - // for each element in the domain - RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[e], rule); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_X, BLOCK_Y, BLOCK_Z)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, e_range, [du, rule, &ctx, qf_derivatives, dr, num_qpts](int e) { + if constexpr (g == mfem::Geometry::CUBE) { + // (batch) interpolate each quadrature point's value + auto qf_inputs = trial_element::interpolate(du[e], rule, nullptr, ctx); - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); + auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[e]); - }); + test_element::integrate(qf_outputs, rule, &dr[e], ctx); + } + }); + }); } /** @@ -427,39 +494,58 @@ void element_gradient_kernel(ExecArrayView dK, constexpr int nquad = num_quadrature_points(g, Q); constexpr TensorProductQuadratureRule rule{}; #if defined(USE_CUDA) - std::cout << "USING CUDA :)\n"; - using policy = RAJA::cuda_exec<512>; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; #else 
- using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; #endif // for each element in the domain - RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(std::size_t e) { - static constexpr bool is_QOI_2 = test::family == Family::QOI; - [[maybe_unused]] auto* output_ptr = reinterpret_cast(&dK(e, 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - if constexpr (is_QOI_2) { - get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; - } else { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - }); -} +// RAJA::launch(RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)), +// [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { +// if constexpr (g == mfem::Geometry::CUBE) { +// static constexpr bool is_QOI_2 = test::family == Family::QOI; +// [[maybe_unused]] auto* output_ptr = +// reinterpret_cast(&dK(e, 0, 0)); +// +// (void*)qf_derivatives; +//#ifdef USE_CUDA +// using threads_x = RAJA::LoopPolicy; +//#else +// using threads_x = RAJA::LoopPolicy; +//#endif +// tensor derivatives{}; +// RAJA::RangeSegment x_range(0, nquad); +// RAJA::loop(ctx, x_range, [&](int q) { +// if constexpr (is_QOI_2) { +// get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; +// } else { +// derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; +// } +// }); +// printf("here 3, num elem %d\n", trial_element::ndof); +// +// RAJA::RangeSegment J_range(0, trial_element::ndof); +// RAJA::loop(ctx, x_range, [&](int J) { +// auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); +// test_element::integrate(source_and_flux, rule, output_ptr + J, +// trial_element::ndof); +// }); +// } +// }); +//}); +#if defined(USE_CUDA) + std::cout << "L480\n"; + printCUDAMemUsage(); +#endif +} // namespace domain_integral template std::function&, double*, bool)> evaluation_kernel( signature s, lambda_type qf, const double* positions, const double* jacobians, - std::shared_ptr > qf_state, std::shared_ptr qf_derivatives, + std::shared_ptr> qf_state, std::shared_ptr qf_derivatives, uint32_t num_elements) { auto trial_elements = trial_elements_tuple(s); diff --git a/src/serac/numerics/functional/dual.hpp b/src/serac/numerics/functional/dual.hpp index 949e0744c..a8c302a15 100644 --- a/src/serac/numerics/functional/dual.hpp +++ b/src/serac/numerics/functional/dual.hpp @@ -108,6 +108,8 @@ SERAC_HOST_DEVICE constexpr auto operator-(dual a, dual SERAC_HOST_DEVICE constexpr auto operator*(const dual& a, double b) { + // printf("dual operator* 3\n"); + // return a; return dual{a.value * b, a.gradient * b}; } @@ -115,6 +117,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(const dual& a, double template SERAC_HOST_DEVICE constexpr auto operator*(double a, const dual& b) { + // printf("dual operator* 2\n"); return dual{a * b.value, a * b.gradient}; } @@ -122,6 +125,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(double a, const dual& template SERAC_HOST_DEVICE constexpr auto operator*(dual a, dual b) { + // printf("dual operator* 1\n"); return dual{a.value * b.value, b.value * a.gradient + a.value * b.gradient}; } @@ -427,7 +431,7 @@ SERAC_HOST_DEVICE constexpr auto get_value(dual arg) /** @brief return the "gradient" part from a dual number type 
*/ template -SERAC_HOST_DEVICE constexpr auto get_gradient(dual arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(dual arg) { return arg.gradient; } diff --git a/src/serac/numerics/functional/element_restriction.hpp b/src/serac/numerics/functional/element_restriction.hpp index f082e9236..60edd891c 100644 --- a/src/serac/numerics/functional/element_restriction.hpp +++ b/src/serac/numerics/functional/element_restriction.hpp @@ -77,7 +77,7 @@ struct DoF { uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } /// get the index field of this `DoF` - + uint64_t index() const { return (bits & index_mask); } }; @@ -175,11 +175,11 @@ struct ElementRestriction { * @param i the index of the element * @param dofs (output) the DoFs associated with element `i` */ - + void GetElementVDofs(int i, DoF* vdofs) const; /// get the dof information for a given node / component - + DoF GetVDof(DoF node, uint64_t component) const; /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" diff --git a/src/serac/numerics/functional/finite_element.hpp b/src/serac/numerics/functional/finite_element.hpp index 28174f703..7c8221340 100644 --- a/src/serac/numerics/functional/finite_element.hpp +++ b/src/serac/numerics/functional/finite_element.hpp @@ -244,16 +244,25 @@ struct QOI { * @param jacobians the jacobians of the isoparametric map from parent to physical space of each quadrature point */ template -SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor& jacobians) +SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx = {}) { [[maybe_unused]] constexpr int VALUE = 0; [[maybe_unused]] constexpr int DERIVATIVE = 1; - - for (int k = 0; k < q; k++) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment k_range(0, BLOCK_SZ); + RAJA::loop(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J[row][col] = jacobians(col, row, k); + J[row][col] = jacobians[block_idx](col, row, k); } } @@ -269,7 +278,7 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor(qf_input[k]) = dot(get(qf_input[k]), transpose(J)); } } - } + }); } /** diff --git a/src/serac/numerics/functional/functional.hpp b/src/serac/numerics/functional/functional.hpp index 67dafe6bd..a378754a7 100644 --- a/src/serac/numerics/functional/functional.hpp +++ b/src/serac/numerics/functional/functional.hpp @@ -33,7 +33,6 @@ #include #include - namespace serac { template @@ -519,6 +518,12 @@ class Functional { trial_space_(f.trial_space_[which]), df_(f.test_space_->GetTrueVSize()) { + // #ifdef USE_CUDA + // df_.UseDevice(true); + // input_L_.UseDevice(true); + // output_L_.UseDevice(true); + // output_T_.UseDevice(true); + // #endif } /** @@ -575,7 +580,10 @@ class Functional { integral.ComputeElementGradients(K_elem, which_argument); } - +#ifdef USE_CUDA + std::cout << "Printing USAGE before assemble\n"; + printCUDAMemUsage(); +#endif for (auto type : Integral::Types) { auto K_elem = element_gradients[type]; auto test_restrictions = form_.G_test_[type].restrictions; @@ -628,6 +636,10 @@ class Functional { } } } +#ifdef USE_CUDA + std::cout << "Printing USAGE AFTER assemble\n"; + printCUDAMemUsage(); +#endif // Copy the column indices to an auxilliary array as MFEM can mutate these during 
HypreParMatrix construction col_ind_copy_ = lookup_tables.col_ind; diff --git a/src/serac/numerics/functional/geometric_factors.cpp b/src/serac/numerics/functional/geometric_factors.cpp index d67ace5a8..fdb921497 100644 --- a/src/serac/numerics/functional/geometric_factors.cpp +++ b/src/serac/numerics/functional/geometric_factors.cpp @@ -45,13 +45,13 @@ void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobian auto J_q = reinterpret_cast(jacobians_q.ReadWrite()); auto X = reinterpret_cast(positions_e.Read()); #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif // for each element in the domain - //RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { - for (uint32_t e = 0; e < num_elements; ++e) { + // RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { + for (uint32_t e = 0; e < num_elements; ++e) { // load the positions for the nodes in this element auto X_e = X[e]; @@ -73,7 +73,7 @@ void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobian } } } - } + } //}); } diff --git a/src/serac/numerics/functional/integral.hpp b/src/serac/numerics/functional/integral.hpp index 0b7889de4..977e78432 100644 --- a/src/serac/numerics/functional/integral.hpp +++ b/src/serac/numerics/functional/integral.hpp @@ -88,9 +88,30 @@ struct Integral { for (auto& [geometry, func] : kernels) { std::vector inputs(active_trial_spaces_.size()); for (std::size_t i = 0; i < active_trial_spaces_.size(); i++) { - inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); + if (input_E[uint32_t(active_trial_spaces_[i])].UseDevice()) { + std::cout << "is device " << std::endl; + } + input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).UseDevice(true); +#ifdef USE_CUDA + const auto& mfem_vec = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry); + inputs[i] = const_cast( + copy_data(const_cast(mfem_vec.Read()), mfem_vec.Size() * sizeof(double), "DEVICE")); +#else + inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); +#endif } +#ifdef USE_CUDA + std::cout << "L103" << std::endl; + printCUDAMemUsage(); +#endif func(inputs, output_E.GetBlock(geometry).ReadWrite(), update_state); +// Deallocate +#ifdef USE_CUDA + for (auto input : inputs) { + deallocate(const_cast(input), "DEVICE"); + } + printCUDAMemUsage(); +#endif } } @@ -111,7 +132,25 @@ struct Integral { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { for (auto& [geometry, func] : jvp_[functional_to_integral_index_.at(differentiation_index)]) { - func(input_E.GetBlock(geometry).Read(), output_E.GetBlock(geometry).ReadWrite()); + const auto& mfem_vec = input_E.GetBlock(geometry); +#ifdef USE_CUDA + auto device_input = const_cast( + copy_data(const_cast(mfem_vec.Read()), mfem_vec.Size() * sizeof(double), "DEVICE")); + auto device_output = copy_data(const_cast(output_E.GetBlock(geometry).ReadWrite()), + output_E.GetBlock(geometry).Size() * sizeof(double), "DEVICE"); +#else + auto device_input = mfem_vec.Read(); + auto device_output = output_E.GetBlock(geometry).ReadWrite(); +#endif + func(device_input, device_output); +#ifdef USE_CUDA + auto& rm = umpire::ResourceManager::getInstance(); + deallocate(const_cast(device_input), "DEVICE"); + rm.copy(output_E.GetBlock(geometry).ReadWrite(), device_output); + 
deallocate(device_output, "DEVICE"); + std::cout << "L148" << std::endl; + printCUDAMemUsage(); +#endif } } } diff --git a/src/serac/numerics/functional/tensor.hpp b/src/serac/numerics/functional/tensor.hpp index 3150550a8..36385842d 100644 --- a/src/serac/numerics/functional/tensor.hpp +++ b/src/serac/numerics/functional/tensor.hpp @@ -12,12 +12,40 @@ #pragma once +#define BLOCK_SZ 128 +#define BLOCK_X 8 +#define BLOCK_Y 8 +#define BLOCK_Z 4 + #include "serac/infrastructure/accelerator.hpp" #include "detail/metaprogramming.hpp" +#include +#include #include +namespace { +template +DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + + DataType* dest_data = static_cast(dest_allocator.allocate(size)); + + umpire::register_external_allocation( + source_data, umpire::util::AllocationRecord(source_data, size, rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + + // _sphinx_tag_tut_copy_start + rm.copy(dest_data, source_data); + // _sphinx_tag_tut_copy_end + + return dest_data; +} +} // namespace + namespace serac { /** @@ -702,6 +730,7 @@ template SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tensor& B) { tensor AB{}; + for (int i = 0; i < m; i++) { for (int j = 0; j < p; j++) { for (int k = 0; k < n; k++) { @@ -1248,7 +1277,7 @@ SERAC_HOST_DEVICE constexpr auto detApIm1(const tensor& A) // clang-format off // equivalent to tr(A) + I2(A) + det(A) - return A(0, 0) + A(1, 1) + A(2, 2) + return A(0, 0) + A(1, 1) + A(2, 2) - A(0, 1) * A(1, 0) * (1 + A(2, 2)) + A(0, 0) * A(1, 1) * (1 + A(2, 2)) - A(0, 2) * A(2, 0) * (1 + A(1, 1)) @@ -1324,12 +1353,8 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; - if constexpr (d3 != 0) return tensor{}; - }(); - if constexpr (d3 == 0) { + auto C = tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { U sum{}; @@ -1342,7 +1367,10 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { for (int k = 0; k < d3; k++) { @@ -1359,11 +1387,88 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor +SERAC_DEVICE constexpr auto deduce_contract_return_type(const tensor& A, const tensor& B) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + constexpr int new_dim = (i2 == 0) ? q : p; + constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + constexpr int d2 = (i1 == 1) ? new_dim : Adims[1]; + constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + if constexpr (d3 == 0) { + return serac::tensor{}; + } + return tensor{}; +} + +template +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const zero&, const tensor&) +{ + return zero{}; +} + +#ifdef USE_CUDA +template +SERAC_DEVICE void contract(const tensor& A, const tensor& B, tensor* C, int qx, + int qy, int qz, bool accumulate = false) +{ + constexpr int Adims[] = {m, n...}; + for (int i0 = qz; i0 < n0; i0 += BLOCK_Z) { + for (int i1 = qy; i1 < n1; i1 += BLOCK_Y) { + for (int i2 = qx; i2 < n2; i2 += BLOCK_X) { + double sum = (accumulate) ? 
(*C)(i0, i1, i2) : 0.0; + + for (int j = 0; j < Adims[l1]; j++) { + if constexpr (l1 == 0 && l2 == 1) { + sum += A(j, i1, i2) * B(i0, j); + } + if constexpr (l1 == 1 && l2 == 1) { + sum += A(i0, j, i2) * B(i1, j); + } + if constexpr (l1 == 2 && l2 == 1) { + sum += A(i0, i1, j) * B(i2, j); + } + } + + (*C)(i0, i1, i2) = sum; + } + } + } +} + +template +SERAC_DEVICE void contract(const zero&, const tensor&, zero*, int, int, int, bool accumulate = false) +{ + return; +} + +template +SERAC_DEVICE void contract(const zero&, const tensor&, tensor*, int, int, int, + bool accumulate = false) +{ + return; +} + +template +SERAC_DEVICE void contract(const tensor&, const tensor&, zero*, int, int, int, + bool accumulate = false) +{ + return; } +#endif + /// @overload template SERAC_HOST_DEVICE auto contract(const zero&, const T&) @@ -1591,6 +1696,62 @@ SERAC_HOST_DEVICE constexpr tensor inv(const tensor& return invA; } + +/** + * @overload + * @note Compute ith jth inverse element + */ +template +SERAC_HOST_DEVICE constexpr double inv_elem(const tensor& A) +{ + double inv_detA(1.0 / det(A)); + + if constexpr (i == 0 && j == 0) { + return (A[1][1] * A[2][2] - A[1][2] * A[2][1]) * inv_detA; + } else if constexpr (i == 0 && j == 1) { + return (A[0][2] * A[2][1] - A[0][1] * A[2][2]) * inv_detA; + } else if constexpr (i == 0 && j == 2) { + return (A[0][1] * A[1][2] - A[0][2] * A[1][1]) * inv_detA; + } else if constexpr (i == 1 && j == 0) { + return (A[1][2] * A[2][0] - A[1][0] * A[2][2]) * inv_detA; + } else if constexpr (i == 1 && j == 1) { + return (A[0][0] * A[2][2] - A[0][2] * A[2][0]) * inv_detA; + } else if constexpr (i == 1 && j == 2) { + return (A[0][2] * A[1][0] - A[0][0] * A[1][2]) * inv_detA; + } else if constexpr (i == 2 && j == 0) { + return (A[1][0] * A[2][1] - A[1][1] * A[2][0]) * inv_detA; + } else if constexpr (i == 2 && j == 1) { + return (A[0][1] * A[2][0] - A[0][0] * A[2][1]) * inv_detA; + } else if constexpr (i == 2 && j == 2) { + return (A[0][0] * A[1][1] - A[0][1] * A[1][0]) * inv_detA; + } + // error case + return 0; +} + +/** + * @overload + * @note Compute ith jth inverse element + */ +template +SERAC_HOST_DEVICE constexpr double inv_elem(const tensor& A) +{ + double inv_detA(1.0 / det(A)); + + if constexpr (i == 0 && j == 0) { + return A[1][1] * inv_detA; + } else if constexpr (i == 0 && j == 1) { + return -A[0][1] * inv_detA; + } else if constexpr (i == 1 && j == 0) { + return -A[1][0] * inv_detA; + } else if constexpr (i == 1 && j == 1) { + return A[0][0] * inv_detA; + } + + // error case + return 0; +} + /** * @overload * @note For N-by-N matrices with N > 3, requires Gaussian elimination @@ -1644,8 +1805,8 @@ inline SERAC_HOST_DEVICE void print(double value) { printf("%f", value); } * @brief print a tensor using `printf`, so that it is suitable for use inside cuda kernels. 
* @param[in] A The tensor to write out */ -template -SERAC_HOST_DEVICE void print(const tensor& A) +template +SERAC_HOST_DEVICE void print(const tensor& A) { printf("{"); print(A[0]); @@ -1867,10 +2028,7 @@ SERAC_HOST_DEVICE constexpr int size(const tensor*) * @tparam n the extents of each dimension * @return the total number of values stored in the tensor */ -SERAC_HOST_DEVICE constexpr int size(const zero*) -{ - return 1; -} +SERAC_HOST_DEVICE constexpr int size(const zero*) { return 1; } /** * @overload @@ -2012,12 +2170,12 @@ inline void eig(const r2tensor < 3, 3 > & A, // then just use the basis for the orthogonal complement // found earlier if (fabs(delta) <= 1.0e-15) { - + for (int i = 0; i < 3; i++){ Q(i,1) = s0(i); Q(i,2) = s1(i); - } - + } + // otherwise compute the remaining eigenvectors } else { @@ -2143,7 +2301,7 @@ inline mat < 2, 2 > look_at(const vec < 2 > & direction) { inline mat < 3, 3 > R3_basis(const vec3 & n) { float sign = (n[2] >= 0.0f) ? 1.0f : -1.0f; - float a = -1.0f / (sign + n[2]); + float a = -1.0f / (sign + n[2]); float b = n[0] * n[1] * a; return mat < 3, 3 >{ diff --git a/src/serac/numerics/functional/tests/CMakeLists.txt b/src/serac/numerics/functional/tests/CMakeLists.txt index b69210de6..a37f02c38 100644 --- a/src/serac/numerics/functional/tests/CMakeLists.txt +++ b/src/serac/numerics/functional/tests/CMakeLists.txt @@ -74,6 +74,4 @@ target_compile_definitions(functional_cuda PUBLIC RAJA_ENABLE_CUDA) target_compile_definitions(functional_basic_h1_scalar_cuda PUBLIC ENABLE_CUDA) target_compile_definitions(functional_basic_h1_vector_cuda PUBLIC USE_CUDA) - - target_compile_definitions(functional_basic_h1_vector_cuda PUBLIC RAJA_ENABLE_CUDA) endif() diff --git a/src/serac/numerics/functional/tests/check_gradient.hpp b/src/serac/numerics/functional/tests/check_gradient.hpp index 27f8c8e29..870b151e1 100644 --- a/src/serac/numerics/functional/tests/check_gradient.hpp +++ b/src/serac/numerics/functional/tests/check_gradient.hpp @@ -19,15 +19,24 @@ void check_gradient(serac::Functional& f, mfem::Vector& U, double epsil int seed = 42; mfem::Vector dU(U.Size()); + // Set memory backend to device if using GPU execution. 
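+  // (The device flag is set before dU is filled by Randomize() below, matching the
+  //  UseDevice-before-Randomize ordering used in the CUDA test drivers further down.)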
+ if constexpr (exec == serac::ExecutionSpace::GPU) { + dU.UseDevice(true); + } dU.Randomize(seed); auto [value, dfdU] = f(serac::differentiate_wrt(U)); std::unique_ptr dfdU_matrix = assemble(dfdU); // jacobian vector products - mfem::Vector df_jvp1 = dfdU(dU); // matrix-free + mfem::Vector df_jvp1; + df_jvp1.UseDevice(true); + df_jvp1 = dfdU(dU); // matrix-free mfem::Vector df_jvp2(df_jvp1.Size()); + if constexpr (exec == serac::ExecutionSpace::GPU) { + df_jvp2.UseDevice(true); + } dfdU_matrix->Mult(dU, df_jvp2); // sparse matvec if (df_jvp1.Norml2() != 0) { diff --git a/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp b/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp index a2a7a05ef..5fd3cd387 100644 --- a/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp +++ b/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp @@ -6,7 +6,7 @@ #include #include -#define USE_CUDA + #include "mfem.hpp" #include @@ -89,20 +89,21 @@ void weird_mixed_test(std::unique_ptr& mesh) auto [test_fes, test_col] = generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fes->TrueVSize()); - U.Randomize(); #ifdef USE_CUDA + U.UseDevice(true); Functional residual(test_fes.get(), {trial_fes.get()}); #else Functional residual(test_fes.get(), {trial_fes.get()}); #endif + U.Randomize(); // note: this is not really an elasticity problem, it's testing source and flux // terms that have the appropriate shapes to ensure that all the differentiation // code works as intended residual.AddDomainIntegral(Dimension{}, DependsOn<0>{}, MixedModelOne{}, *mesh); - residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, MixedModelTwo{}, *mesh); + // residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, MixedModelTwo{}, *mesh); check_gradient(residual, U); } @@ -118,20 +119,22 @@ void elasticity_test(std::unique_ptr& mesh) auto [test_fes, test_col] = generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fes->TrueVSize()); - U.Randomize(); #ifdef USE_CUDA + U.UseDevice(true); Functional residual(test_fes.get(), {trial_fes.get()}); #else Functional residual(test_fes.get(), {trial_fes.get()}); #endif + U.Randomize(); + // note: this is not really an elasticity problem, it's testing source and flux // terms that have the appropriate shapes to ensure that all the differentiation // code works as intended residual.AddDomainIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelOne{}, *mesh); - residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelTwo{}, *mesh); + // residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelTwo{}, *mesh); check_gradient(residual, U); } @@ -157,16 +160,15 @@ void test_suite(std::string meshfile) } } - //TEST(VectorValuedH1, test_suite_tris) { test_suite("/data/meshes/patch2D_tris.mesh"); } - //TEST(VectorValuedH1, test_suite_quads) { test_suite("/data/meshes/patch2D_quads.mesh"); } -//TEST(VectorValuedH1, test_suite_tris_and_quads) { test_suite("/data/meshes/patch2D_tris_and_quads.mesh"); } - -//TEST(VectorValuedH1, test_suite_tets) { test_suite("/data/meshes/patch3D_tets.mesh"); } +// TEST(VectorValuedH1, test_suite_tris) { test_suite("/data/meshes/patch2D_tris.mesh"); } +// TEST(VectorValuedH1, test_suite_quads) { test_suite("/data/meshes/patch2D_quads.mesh"); } +// TEST(VectorValuedH1, test_suite_tris_and_quads) { test_suite("/data/meshes/patch2D_tris_and_quads.mesh"); } +// TEST(VectorValuedH1, test_suite_tets) { test_suite("/data/meshes/patch3D_tets.mesh"); } - 
TEST(VectorValuedH1, test_suite_hexes) { test_suite("/data/meshes/patch3D_hexes.mesh"); } +TEST(VectorValuedH1, test_suite_hexes) { test_suite("/data/meshes/patch3D_hexes.mesh"); } - TEST(VectorValuedH1, test_suite_tets_and_hexes) { test_suite("/data/meshes/patch3D_tets_and_hexes.mesh"); } +// TEST(VectorValuedH1, test_suite_tets_and_hexes) { test_suite("/data/meshes/patch3D_tets_and_hexes.mesh"); } int main(int argc, char* argv[]) { @@ -179,18 +181,18 @@ int main(int argc, char* argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &myid); axom::slic::SimpleLogger logger; -//cudaSetDevice(0); -//cudaDeviceReset(); -//#ifdef USE_CUDA -//printCUDAMemUsage(); -//#endif -//cudaSetDevice(2); -cudaDeviceSynchronize(); + // cudaSetDevice(0); + // cudaDeviceReset(); + //#ifdef USE_CUDA + // printCUDAMemUsage(); + //#endif + // cudaSetDevice(2); + int result = RUN_ALL_TESTS(); -cudaDeviceSynchronize(); -//#ifdef USE_CUDA -//printCUDAMemUsage(); -//#endif + + //#ifdef USE_CUDA + // printCUDAMemUsage(); + //#endif MPI_Finalize(); return result; diff --git a/src/serac/numerics/functional/tests/interpolate_device.cpp b/src/serac/numerics/functional/tests/interpolate_device.cpp new file mode 100644 index 000000000..db19f749c --- /dev/null +++ b/src/serac/numerics/functional/tests/interpolate_device.cpp @@ -0,0 +1,42 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + +#include +#include + +#include "mfem.hpp" + +#include + +#include "axom/slic/core/SimpleLogger.hpp" +#include "serac/infrastructure/input.hpp" +#include "serac/numerics/functional/finite_element.hpp" +#include "serac/serac_config.hpp" +#include "serac/mesh/mesh_utils_base.hpp" +#include "serac/numerics/stdfunction_operator.hpp" +#include "serac/numerics/functional/functional.hpp" +#include "serac/numerics/functional/tensor.hpp" + +#include "serac/numerics/functional/tests/check_gradient.hpp" + + + +int main { + using element_type = serac::finite_element>; + double* U_e; //input buffer, 2D array + double* U_q; //output buffer, 2D array + double* dU_dxi_q; //gradient 3D + // allocate buffers with umpire + + RAJA::forall() .. 
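+  // (The execution policy and range are elided in this sketch; an illustrative form,
+  //  with `num_elements` a hypothetical element count, would be
+  //  RAJA::forall<RAJA::cuda_exec<BLOCK_SZ>>(RAJA::RangeSegment(0, num_elements),
+  //                                          [=] RAJA_DEVICE (int e) { ... });)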
{ + using local_array_mem_policy = RAJA::cuda_shared_mem; + RAJA::LocalArray shared_buffer; + // copy input buffer into shared_buffer + element_type::interpolate() + // check U_q and gradient dU_dxi_q + } + return 0; +} \ No newline at end of file diff --git a/src/serac/numerics/functional/tuple.hpp b/src/serac/numerics/functional/tuple.hpp index 49c790e99..93244aa24 100644 --- a/src/serac/numerics/functional/tuple.hpp +++ b/src/serac/numerics/functional/tuple.hpp @@ -205,7 +205,7 @@ struct tuple_size_ptr> : std::integral_constant +template SERAC_HOST_DEVICE constexpr int size(const serac::tuple) { return tuple_size>::value; @@ -218,7 +218,7 @@ SERAC_HOST_DEVICE constexpr int size(const serac::tuple) * @tparam n the extents of each dimension * @return the total number of values stored in the tensor */ - template +template SERAC_HOST_DEVICE constexpr int size(const serac::tuple) { return tuple_size_ptr>::value; @@ -574,6 +574,7 @@ template SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const tuple& y, std::integer_sequence) { + // printf("mult helper* 3\n"); return tuple{get(x) * get(y)...}; } @@ -587,6 +588,7 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const tuple SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple& y) { + // printf("operator* 1\n"); static_assert(sizeof...(S) == sizeof...(T)); return mult_helper(x, y, std::make_integer_sequence(sizeof...(S))>()); } @@ -600,9 +602,10 @@ SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple -SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& x, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const TupleType& x, std::integer_sequence) { + // printf("mult helper* 1\n"); return tuple{a * get(x)...}; } @@ -615,9 +618,10 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& * @param a a constant multiplier * @return the returned tuple product */ -template -SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double a, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const TupleType& x, const double a, std::integer_sequence) { + // printf("mult helper* 2\n"); return tuple{get(x) * a...}; } @@ -630,6 +634,7 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double template SERAC_HOST_DEVICE constexpr auto operator*(const double a, const tuple& x) { + // printf("operator* 2\n"); return mult_helper(a, x, std::make_integer_sequence(sizeof...(T))>()); } @@ -642,6 +647,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(const double a, const tuple& x) template SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const double a) { + // printf("operator* 3\n"); return mult_helper(x, a, std::make_integer_sequence(sizeof...(T))>()); } diff --git a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp index a1ef69f75..1de092542 100644 --- a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp +++ b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "serac/numerics/functional/tuple.hpp" #include "serac/numerics/functional/tensor.hpp" #include "serac/numerics/functional/dual.hpp" @@ -220,17 +221,37 @@ SERAC_HOST_DEVICE auto promote_to_dual_when(const T& x) * @param x the values to be promoted */ template -SERAC_HOST_DEVICE auto promote_each_to_dual_when(const tensor& x) +SERAC_HOST_DEVICE auto 
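+// (The two extra parameters below support the GPU path: `output_ptr` points at a
+//  caller-allocated output tensor that is written through `casted_output_ptr` when
+//  USE_CUDA is defined, and `ctx` is the RAJA::LaunchContext consumed by RAJA::loop.)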
promote_each_to_dual_when(const tensor& x, void* output_ptr = nullptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) { if constexpr (dualify) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + + RAJA::RangeSegment x_range(0, n); using return_type = decltype(make_dual(T{})); tensor output; - for (int i = 0; i < n; i++) { + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop(ctx, x_range, [&](int i) { +#ifndef USE_CUDA output[i] = make_dual(x[i]); - } +#else + (*casted_output_ptr)[i] = make_dual(x[i]); +#endif + }); return output; } if constexpr (!dualify) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; + + RAJA::RangeSegment x_range(0, n); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop(ctx, x_range, [&](int i) { (*casted_output_ptr)[i] = x[i]; }); +#endif return x; } } @@ -503,7 +524,7 @@ SERAC_HOST_DEVICE auto get_value(const tensor, n...>& arg) * @param[in] arg The tensor of dual numbers */ template -SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor, n...>& arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(const tensor, n...>& arg) { tensor g{}; for_constexpr([&](auto... i) { g(i...) = arg(i...).gradient; }); @@ -512,7 +533,7 @@ SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor, n...>& /// @overload template -SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor>, n...>& arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(const tensor>, n...>& arg) { tensor g{}; for_constexpr([&](auto... i) { g(i...) = arg(i...).gradient; });
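The anonymous `copy_data` helper added to tensor.hpp above is a thin wrapper over Umpire's allocator and copy machinery. For readers unfamiliar with that API, here is a minimal, self-contained sketch of the same host-to-device round trip; the buffer length `N` is illustrative, and the "DEVICE" allocator assumes a GPU-enabled Umpire build. Unlike `copy_data`, both buffers are allocated through Umpire, so no external-allocation registration step is needed.

#include <cstddef>

#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

int main()
{
  constexpr std::size_t N = 128;  // illustrative buffer length

  auto& rm = umpire::ResourceManager::getInstance();
  umpire::Allocator host_allocator   = rm.getAllocator("HOST");
  umpire::Allocator device_allocator = rm.getAllocator("DEVICE");  // GPU-enabled build only

  // allocating both buffers through Umpire registers them with the ResourceManager,
  // so rm.copy() can work out the direction of the transfer on its own
  auto* host_data   = static_cast<double*>(host_allocator.allocate(N * sizeof(double)));
  auto* device_data = static_cast<double*>(device_allocator.allocate(N * sizeof(double)));

  for (std::size_t i = 0; i < N; i++) {
    host_data[i] = static_cast<double>(i);
  }

  rm.copy(device_data, host_data);  // host -> device
  rm.copy(host_data, device_data);  // device -> host

  device_allocator.deallocate(device_data);
  host_allocator.deallocate(host_data);
  return 0;
}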
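The check_gradient.hpp changes above exercise the matrix-free Jacobian-vector product and the assembled matrix on device-flagged vectors. A central-difference probe is the usual independent cross-check; the sketch below shows that pattern for a generic residual callable `f` returning an mfem::Vector (the callable, the step `epsilon`, and the helper name are illustrative, not part of the Serac API):

#include "mfem.hpp"

// Central-difference check of a directional derivative, sketched for an arbitrary
// callable f : mfem::Vector -> mfem::Vector (e.g. a residual evaluation).
template <typename Residual>
double jvp_relative_error(Residual&& f, const mfem::Vector& U, const mfem::Vector& dU,
                          const mfem::Vector& df_jvp, double epsilon = 1.0e-6)
{
  mfem::Vector U_plus(U), U_minus(U);
  U_plus.Add(epsilon, dU);    // U + epsilon * dU
  U_minus.Add(-epsilon, dU);  // U - epsilon * dU

  mfem::Vector df_fd = f(U_plus);   // (f(U + eps dU) - f(U - eps dU)) / (2 eps)
  df_fd -= f(U_minus);
  df_fd /= 2.0 * epsilon;

  df_fd -= df_jvp;                  // compare against the analytic jacobian-vector product
  return df_fd.Norml2() / (df_jvp.Norml2() + 1.0e-30);
}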