diff --git a/CMakeLists.txt b/CMakeLists.txt index 21a83a7..6d45c21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,60 +1,15 @@ -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(cloverleaf_sycl) +project(cloverleaf_openmp_target) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# -#set (OpenMP_CXX_FLAGS "-fiopenmp -qnextgen") -#set (OpenMP_CXX_LIB_NAMES "libiomp5") -#set (OpenMP_libiomp5_LIBRARY -# /opt/intel/oneapi/compiler/2021.1-beta08/linux/compiler/lib/intel64_lin/libiomp5.so -# ) find_package(OpenMP REQUIRED) set(CMAKE_VERBOSE_MAKEFILE YES) -#set(SYCL_RUNTIME DPCPP) -#set(HIPSYCL_INSTALL_DIR /home/tom/hypsycl_dist) - - -#if (SYCL_RUNTIME) -# -# list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) -# -# if (${SYCL_RUNTIME} STREQUAL "HIPSYCL") -# -# if (NOT HIPSYCL_INSTALL_DIR) -# message(FATAL_ERROR "HIPSYCL_INSTALL_DIR is undefined") -# endif () -# -# set(hipSYCL_DIR ${HIPSYCL_INSTALL_DIR}/lib/cmake) -# find_package(hipSYCL CONFIG REQUIRED) -# set(EXTRA_FLAGS -Wno-sign-compare -Wno-stringop-truncation) -# elseif (${SYCL_RUNTIME} STREQUAL "COMPUTECPP") -# -# if (NOT ComputeCpp_DIR) -# message(FATAL_ERROR "ComputeCpp_DIR is undefined") -# endif () -# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -# set(COMPUTECPP_USER_FLAGS -O3 -fsycl-split-modules=20 -mllvm -inline-threshold=10000 -no-serial-memop) -# find_package(ComputeCpp REQUIRED) -# # set(EXTRA_FLAGS -pedantic) -# elseif (${SYCL_RUNTIME} STREQUAL "DPCPP") -# -# set(CMAKE_CXX_STANDARD 17) -# set(CMAKE_CXX_COMPILER "dpcpp") -# set(EXTRA_FLAGS -pedantic) -# include_directories(/opt/intel/oneapi/compiler/2021.1-beta08/linux/compiler/include/) -# else () -# message(FATAL_ERROR "SYCL_RUNTIME unsupported, must be one of HIPSYCL|COMPUTECPP|DPCPP, got ${SYCL_RUNTIME}") -# endif () -#else () -# message(FATAL_ERROR "SYCL_RUNTIME not defined, must be one of HIPSYCL|COMPUTECPP|DPCPP") -#endif () - if 
(MPI_AS_LIBRARY) @@ -87,6 +42,7 @@ set(SOURCES src/advec_mom.cpp src/advection.cpp src/build_field.cpp + src/finalise_field.cpp src/calc_dt.cpp src/clover_leaf.cpp src/comms.cpp @@ -120,34 +76,51 @@ target_compile_options(clover_leaf -Wall -Wextra -Wcast-align - -Wfatal-errors +# -Wfatal-errors -Werror=return-type -Wno-unused-parameter -Wno-unused-variable - -Wno-ignored-attributes +# -Wno-ignored-attributes ${EXTRA_FLAGS} ) -set(OMP_OFFLOAD_FLAGS -foffload=nvptx-none) +separate_arguments(OMP_OFFLOAD_FLAGS) +separate_arguments(CXX_EXTRA_FLAGS) +separate_arguments(CXX_EXTRA_LINKER_FLAGS) + + +option(OMP_ALLOW_HOST "Whether the OMP clause `if(target: )` is included at compile time, some compilers may not support this feature" ON) -set(DEBUG_OPTIONS -O2 -fno-omit-frame-pointer -fsanitize=address ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) -set(RELEASE_OPTIONS -Ofast -march=native -mtune=native ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) +if (OMP_ALLOW_HOST) + add_definitions(-DOMP_ALLOW_HOST) +endif() + +set(DEBUG_OPTIONS -O2 -fno-omit-frame-pointer ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) +set(RELEASE_OPTIONS -O3 ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) target_link_libraries(clover_leaf PUBLIC ${MPI_C_LIB}) target_link_libraries(clover_leaf PUBLIC OpenMP::OpenMP_CXX OpenMP::OpenMP_C) -# remove when using omp target -#target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) -#target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) -target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) target_compile_options(clover_leaf PUBLIC "$<$:${RELEASE_OPTIONS}>") target_compile_options(clover_leaf PUBLIC "$<$:${RELEASE_OPTIONS}>") target_compile_options(clover_leaf PUBLIC "$<$:${DEBUG_OPTIONS}>") -target_link_options(clover_leaf PUBLIC ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) +if (${CMAKE_VERSION} VERSION_LESS "3.13.0") + message(WARNING "target_link_options is only available in CMake >= 3.13.0, using fallback target_link_libraries, this may cause issues with some compilers") 
+ + set(EXTRA_LINK_FLAGS ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) -#target_link_options(clover_leaf PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) + if (DEFINED CXX_EXTRA_LINKER_FLAGS) + list(APPEND EXTRA_LINK_FLAGS "-Wl,${CXX_EXTRA_LINKER_FLAGS}") + endif () + + target_link_libraries(clover_leaf PUBLIC ${EXTRA_LINK_FLAGS}) + +else () + target_link_options(clover_leaf PUBLIC ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) + target_link_options(clover_leaf PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) +endif () diff --git a/README.md b/README.md index 5e8a861..3810c30 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +> [!WARNING] +> Superseded by <https://github.com/UoB-HPC/CloverLeaf>, which contains an OpenMP target implementation, along with many other models. + # A OpenMP Target port of CloverLeaf This is a port of [CloverLeaf](https://github.com/UoB-HPC/cloverleaf_kokkos) from MPI+Kokkos to MPI+OpenMP Target. @@ -28,28 +31,41 @@ Flags: * Set `MPI_C_INCLUDE_DIR` to /include * Set `MPI_C_LIB` to the library name, for exampe: mpich for libmpich.so * `CXX_EXTRA_FLAGS` - `STRING`, appends extra flags that will be passed on to the compiler, applies to all configs - * `CXX_EXTRA_LINKER_FLAGS` - `STRING`, appends extra linker flags (the comma separated list after the `-Wl` flag) to the linker, applies to all configs - + * `CXX_EXTRA_LINKER_FLAGS` - `STRING`, appends extra linker flags (the comma separated list after the `-Wl` flag) to the linker, applies to all configs + * `OMP_OFFLOAD_FLAGS` - OpenMP 4.5 target offload flags that will be passed directly to the compiler and linker, see example flag combinations below. 
+ * GCC+NVIDIA - `"-foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math"` + * GCC+Radeon - `"-foffload=amdgcn-amdhsa='-march=gfx906' -foffload=-lm -fno-fast-math -fno-associative-math"` + * LLVM+NVIDIA - `"-fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_75"` + * ICC - `"-qnextgen -fiopenmp -fopenmp-targets=spir64"` + * CCE+NVIDIA - `"-fopenmp-targets=nvptx64 -Xopenmp-target -march=sm_60"` + * `OMP_ALLOW_HOST` - `BOOL(ON|OFF)`, enabled by default, set to false if the compiler is unable to support dynamic selection of host/target devices. If disabled, running the binary with `--no-target` emits an error. + + If parts of your toolchain are installed at different places, you'll have to specify it manually, for example: cmake3 -Bbuild -H. \ - -DCMAKE_C_COMPILER=/nfs/software/x86_64/gcc/9.1.0/bin/gcc \ - -DCMAKE_CXX_COMPILER=/nfs/software/x86_64/gcc/9.1.0/bin/g++ \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=Release \ + -DOMP_OFFLOAD_FLAGS="-foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math" Proceed with compiling: - cmake3 --build build --target clover_leaf --config Release -j $(nproc) - + cmake3 --build build --target clover_leaf --config Release -j $(nproc) + +## Known issues + + * ICC 2021.1 Beta 20200602 requires `-DOMP_ALLOW_HOST=OFF` + ## Running -The main `clover_leaf` executable takes a `clover.in` file as parameter and outputs `clover.out` at working directory. +The main `clover_leaf` executable takes a `clover.in` file as parameter and outputs `clover.out` in the working directory. For example, after successful compilation, at **project root**: - ./build/clover_leaf --file InputDecks/clover_bm16_short.in + ./build/clover_leaf --file InputDecks/clover_bm16_short.in See [Tested configurations](#tested-configurations) for tested platforms and drivers. 
@@ -58,7 +74,8 @@ For help, use the `-h` flag: Options: -h --help Print the message --list List available devices + --no-target Use OMP fallback --device Select device at INDEX from output of --list - --input Custom clover.in file (defaults to clover.in if unspecified) + --file Custom clover.in file (defaults to clover.in if unspecified) ``` diff --git a/src/PdV.cpp b/src/PdV.cpp index 94c7268..9ce30c1 100644 --- a/src/PdV.cpp +++ b/src/PdV.cpp @@ -26,7 +26,7 @@ #include "ideal_gas.h" #include "update_halo.h" #include "revert.h" -#include "utils.hpp" + // @brief Fortran PdV kernel. // @author Wayne Gaudin @@ -35,88 +35,118 @@ // level of the velocity data depends on whether it is invoked as the // predictor or corrector. void PdV_kernel( + bool use_target, bool predict, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel0, - clover::Buffer2D &yvel1, - clover::Buffer2D &volume_change) { + field_type &field +) { + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; // DO k=y_min,y_max // DO j=x_min,x_max if (predict) { - _Pragma("kernel2d") + double *xarea = field.xarea.data; + + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel0 = 
field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *volume_change = field.work_array1.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double left_flux = (xarea(i, j) * (xvel0(i, j) + - xvel0(i + 0, j + 1) + - xvel0(i, j) + - xvel0(i + 0, j + 1))) * 0.25 * dt * 0.5; - double right_flux = (xarea(i + 1, j + 0) * (xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1) + - xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1))) * 0.25 * dt * 0.5; - double bottom_flux = (yarea(i, j) * (yvel0(i, j) + - yvel0(i + 1, j + 0) + - yvel0(i, j) + - yvel0(i + 1, j + 0))) * 0.25 * dt * 0.5; - double top_flux = (yarea(i + 0, j + 1) * (yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1) + - yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1))) * 0.25 * dt * 0.5; + double left_flux = (xarea[i + j * flux_x_stride] * (xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; + double right_flux = (xarea[(i + 1) + (j + 0) * flux_x_stride] * (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] + + xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; + double bottom_flux = (yarea[i + j * flux_y_stride] * (yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride])) * 0.25 * dt * 0.5; + double top_flux = (yarea[(i + 0) + (j + 1) * flux_y_stride] * (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] + + yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; double total_flux = right_flux - left_flux + top_flux - bottom_flux; - double volume_change_s = volume(i, j) 
/ (volume(i, j) + total_flux); - double min_cell_volume = std::fmin(std::fmin(volume(i, j) + right_flux - left_flux + top_flux - bottom_flux, volume(i, j) + right_flux - left_flux), volume(i, j) + top_flux - bottom_flux); - double recip_volume = 1.0 / volume(i, j); - double energy_change = (pressure(i, j) / density0(i, j) + viscosity(i, j) / density0(i, j)) * total_flux * recip_volume; - energy1(i, j) = energy0(i, j) - energy_change; - density1(i, j) = density0(i, j) * volume_change_s; + double volume_change_s = volume[i + j * base_stride] / (volume[i + j * base_stride] + total_flux); + double min_cell_volume = fmin(fmin(volume[i + j * base_stride] + right_flux - left_flux + top_flux - bottom_flux, volume[i + j * base_stride] + right_flux - left_flux), + volume[i + j * base_stride] + top_flux - bottom_flux); + double recip_volume = 1.0 / volume[i + j * base_stride]; + double energy_change = + (pressure[i + j * base_stride] / density0[i + j * base_stride] + viscosity[i + j * base_stride] / density0[i + j * base_stride]) * total_flux * + recip_volume; + energy1[i + j * base_stride] = energy0[i + j * base_stride] - energy_change; + density1[i + j * base_stride] = density0[i + j * base_stride] * volume_change_s; } } } else { - _Pragma("kernel2d") + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *volume_change = field.work_array1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < 
(x_max + 2); i++) { - double left_flux = (xarea(i, j) * (xvel0(i, j) + - xvel0(i + 0, j + 1) + - xvel1(i, j) + - xvel1(i + 0, j + 1))) * 0.25 * dt; - double right_flux = (xarea(i + 1, j + 0) * (xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1) + - xvel1(i + 1, j + 0) + - xvel1(i + 1, j + 1))) * 0.25 * dt; - double bottom_flux = (yarea(i, j) * (yvel0(i, j) + - yvel0(i + 1, j + 0) + - yvel1(i, j) + - yvel1(i + 1, j + 0))) * 0.25 * dt; - double top_flux = (yarea(i + 0, j + 1) * (yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1) + - yvel1(i + 0, j + 1) + yvel1(i + 1, j + 1))) * 0.25 * dt; + double left_flux = (xarea[i + j * flux_x_stride] * (xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel1[i + j * vels_wk_stride] + + xvel1[(i + 0) + (j + 1) * vels_wk_stride])) * 0.25 * dt; + double right_flux = (xarea[(i + 1) + (j + 0) * flux_x_stride] * (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] + + xvel1[(i + 1) + (j + 0) * vels_wk_stride] + + xvel1[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt; + double bottom_flux = (yarea[i + j * flux_y_stride] * (yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel1[i + j * vels_wk_stride] + + yvel1[(i + 1) + (j + 0) * vels_wk_stride])) * 0.25 * dt; + double top_flux = (yarea[(i + 0) + (j + 1) * flux_y_stride] * (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] + + yvel1[(i + 0) + (j + 1) * vels_wk_stride] + yvel1[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt; double total_flux = right_flux - left_flux + top_flux - bottom_flux; - double volume_change_s = volume(i, j) / (volume(i, j) + total_flux); - double min_cell_volume = std::fmin(std::fmin( - volume(i, j) + right_flux - left_flux + top_flux - bottom_flux, volume(i, j) + right_flux - left_flux), - volume(i, j) + top_flux - bottom_flux); - double recip_volume = 1.0 / volume(i, j); - double energy_change = (pressure(i, j) / density0(i, j) + 
viscosity(i, j) / density0(i, j)) * total_flux * recip_volume; - energy1(i, j) = energy0(i, j) - energy_change; - density1(i, j) = density0(i, j) * volume_change_s; + double volume_change_s = volume[i + j * base_stride] / (volume[i + j * base_stride] + total_flux); + double min_cell_volume = fmin(fmin( + volume[i + j * base_stride] + right_flux - left_flux + top_flux - bottom_flux, volume[i + j * base_stride] + right_flux - left_flux), + volume[i + j * base_stride] + top_flux - bottom_flux); + double recip_volume = 1.0 / volume[i + j * base_stride]; + double energy_change = + (pressure[i + j * base_stride] / density0[i + j * base_stride] + viscosity[i + j * base_stride] / density0[i + j * base_stride]) * total_flux * + recip_volume; + energy1[i + j * base_stride] = energy0[i + j * base_stride] - energy_change; + density1[i + j * base_stride] = density0[i + j * base_stride] * volume_change_s; } } } @@ -134,30 +164,25 @@ void PdV(global_variables &globals, bool predict) { globals.error_condition = 0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; - PdV_kernel(predict, + PdV_kernel(globals.use_target, + predict, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.volume, - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1, - t.field.pressure, - t.field.viscosity, - t.field.xvel0, - t.field.xvel1, - t.field.yvel0, - t.field.yvel1, - t.field.work_array1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif clover_check_error(globals.error_condition); if (globals.profiler_on) globals.profiler.PdV += timer() - kernel_time; diff --git a/src/accelerate.cpp b/src/accelerate.cpp index 7e04ab7..e8c93d4 100644 --- a/src/accelerate.cpp +++ b/src/accelerate.cpp @@ -21,7 +21,7 @@ #include "accelerate.h" #include "timer.h" -#include "utils.hpp" + // @brief 
Fortran acceleration kernel @@ -29,18 +29,10 @@ // @details The pressure and viscosity gradients are used to update the // velocity field. void accelerate_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1) { + field_type &field) { double halfdt = 0.5 * dt; @@ -51,29 +43,47 @@ void accelerate_kernel( //for(int j = ) - - _Pragma("kernel2d") + const int xarea_sizex = field.flux_x_stride; + const int yarea_sizex = field.flux_y_stride; + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel1 = field.yvel1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - double stepbymass_s = halfdt / ((density0(i - 1, j - 1) * volume(i - 1, j - 1) + - density0(i - 1, j + 0) * volume(i - 1, j + 0) + density0(i, j) * volume(i, j) + - density0(i + 0, j - 1) * volume(i + 0, j - 1)) * 0.25); - xvel1(i, j) = xvel0(i, j) - - stepbymass_s * (xarea(i, j) * (pressure(i, j) - pressure(i - 1, j + 0)) + - xarea(i + 0, j - 1) * (pressure(i + 0, j - 1) - pressure(i - 1, j - 1))); - yvel1(i, j) = yvel0(i, j) - - stepbymass_s * (yarea(i, j) * - (pressure(i, j) - pressure(i + 0, j - 1)) + - yarea(i - 1, j + 0) * (pressure(i - 1, j + 0) - 
pressure(i - 1, j - 1))); - xvel1(i, j) = xvel1(i, j) - - stepbymass_s * (xarea(i, j) * - (viscosity(i, j) - - viscosity(i - 1, j + 0)) + - xarea(i + 0, j - 1) * (viscosity(i + 0, j - 1) - viscosity(i - 1, j - 1))); - yvel1(i, j) = yvel1(i, j) - - stepbymass_s * (yarea(i, j) * - (viscosity(i, j) - viscosity(i + 0, j - 1)) + - yarea(i - 1, j + 0) * (viscosity(i - 1, j + 0) - viscosity(i - 1, j - 1))); + double stepbymass_s = halfdt / ((density0[(i - 1) + (j - 1) * base_stride] * volume[(i - 1) + (j - 1) * base_stride] + + density0[(i - 1) + (j + 0) * base_stride] * volume[(i - 1) + (j + 0) * base_stride] + + density0[i + j * base_stride] * volume[i + j * base_stride] + + density0[(i + 0) + (j - 1) * base_stride] * volume[(i + 0) + (j - 1) * base_stride]) * 0.25); + xvel1[i + j * vels_wk_stride] = xvel0[i + j * vels_wk_stride] - + stepbymass_s * (xarea[i + j * xarea_sizex] * (pressure[i + j * base_stride] - pressure[(i - 1) + (j + 0) * base_stride]) + + xarea[(i + 0) + (j - 1) * xarea_sizex] * (pressure[(i + 0) + (j - 1) * base_stride] - pressure[(i - 1) + (j - 1) * base_stride])); + yvel1[i + j * vels_wk_stride] = yvel0[i + j * vels_wk_stride] - + stepbymass_s * (yarea[i + j * yarea_sizex] * + (pressure[i + j * base_stride] - pressure[(i + 0) + (j - 1) * base_stride]) + + yarea[(i - 1) + (j + 0) * yarea_sizex] * (pressure[(i - 1) + (j + 0) * base_stride] - pressure[(i - 1) + (j - 1) * base_stride])); + xvel1[i + j * vels_wk_stride] = xvel1[i + j * vels_wk_stride] - + stepbymass_s * (xarea[i + j * xarea_sizex] * + (viscosity[i + j * base_stride] - + viscosity[(i - 1) + (j + 0) * base_stride]) + + xarea[(i + 0) + (j - 1) * xarea_sizex] * + (viscosity[(i + 0) + (j - 1) * base_stride] - viscosity[(i - 1) + (j - 1) * base_stride])); + yvel1[i + j * vels_wk_stride] = yvel1[i + j * vels_wk_stride] - + stepbymass_s * (yarea[i + j * yarea_sizex] * + (viscosity[i + j * base_stride] - viscosity[(i + 0) + (j - 1) * base_stride]) + + yarea[(i - 1) + (j + 0) * yarea_sizex] * + 
(viscosity[(i - 1) + (j + 0) * base_stride] - viscosity[(i - 1) + (j - 1) * base_stride])); } } } @@ -87,30 +97,29 @@ void accelerate(global_variables &globals) { double kernel_time = 0; if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; accelerate_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.volume, - t.field.density0, - t.field.pressure, - t.field.viscosity, - t.field.xvel0, - t.field.yvel0, - t.field.xvel1, - t.field.yvel1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.acceleration += timer() - kernel_time; } diff --git a/src/advec_cell.cpp b/src/advec_cell.cpp index f3aae30..759c665 100644 --- a/src/advec_cell.cpp +++ b/src/advec_cell.cpp @@ -20,7 +20,9 @@ #include #include "advec_cell.h" -#include "utils.hpp" + + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) // @brief Fortran cell advection kernel. @@ -28,31 +30,22 @@ // @details Performs a second order advective remap using van-Leer limiting // with directional splitting. 
void advec_cell_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, int dir, int sweep_number, - clover::Buffer1D &vertexdx, - clover::Buffer1D &vertexdy, - clover::Buffer2D &volume, - clover::Buffer2D &density1, - clover::Buffer2D &energy1, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &mass_flux_y, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &pre_vol, - clover::Buffer2D &post_vol, - clover::Buffer2D &pre_mass, - clover::Buffer2D &post_mass, - clover::Buffer2D &advec_vol, - clover::Buffer2D &post_ener, - clover::Buffer2D &ener_flux) { + field_type &field) { const double one_by_six = 1.0 / 6.0; + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + if (dir == g_xdir) { // DO k=y_min-2,y_max+2 @@ -61,11 +54,20 @@ void advec_cell_kernel( if (sweep_number == 1) { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + (vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j)); - post_vol(i, j) = pre_vol(i, j) - (vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j)); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + + (vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride] + + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - + vol_flux_y[i + j * flux_y_stride]); + post_vol[i + j * vels_wk_stride] = pre_vol[i + j * vels_wk_stride] - (vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] 
- vol_flux_x[i + j * flux_x_stride]); } } @@ -73,11 +75,16 @@ void advec_cell_kernel( } else { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); - post_vol(i, j) = volume(i, j); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; } } @@ -85,58 +92,66 @@ void advec_cell_kernel( // DO k=y_min,y_max // DO j=x_min,x_max+2 - _Pragma("kernel2d") + double *vertexdx = field.vertexdx.data; + double *density1 = field.density1.data; + double *energy1 = field.energy1.data; + double *mass_flux_x = field.mass_flux_x.data; + double *vol_flux_x = field.vol_flux_x.data; + double *pre_vol = field.work_array1.data; + double *ener_flux = field.work_array7.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2 + 2); i++) ({ int upwind, donor, downwind, dif; double sigmat, sigma3, sigma4, sigmav, sigma, sigmam, diffuw, diffdw, limiter, wind; - if (vol_flux_x(i, j) > 0.0) { + if (vol_flux_x[i + j * flux_x_stride] > 0.0) { upwind = i - 2; donor = i - 1; downwind = i; dif = donor; } else { - upwind = std::min(i + 1, x_max + 2); + upwind = MIN(i + 1, x_max + 2); donor = i; downwind = i - 1; dif = upwind; } - sigmat = std::fabs(vol_flux_x(i, j)) / pre_vol(donor, j); + sigmat = fabs(vol_flux_x[i + j * flux_x_stride]) / pre_vol[donor + j * vels_wk_stride]; sigma3 
= (1.0 + sigmat) * (vertexdx[i] / vertexdx[dif]); sigma4 = 2.0 - sigmat; - sigma = sigmat; +// sigma = sigmat; sigmav = sigmat; - diffuw = density1(donor, j) - density1(upwind, j); - diffdw = density1(downwind, j) - density1(donor, j); + diffuw = density1[donor + j * base_stride] - density1[upwind + j * base_stride]; + diffdw = density1[downwind + j * base_stride] - density1[donor + j * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmav) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - mass_flux_x(i, j) = vol_flux_x(i, j) * (density1(donor, j) + limiter); - sigmam = std::fabs(mass_flux_x(i, j)) / (density1(donor, j) * pre_vol(donor, j)); - diffuw = energy1(donor, j) - energy1(upwind, j); - diffdw = energy1(downwind, j) - energy1(donor, j); + mass_flux_x[i + j * flux_x_stride] = vol_flux_x[i + j * flux_x_stride] * (density1[donor + j * base_stride] + limiter); + sigmam = fabs(mass_flux_x[i + j * flux_x_stride]) / (density1[donor + j * base_stride] * pre_vol[donor + j * vels_wk_stride]); + diffuw = energy1[donor + j * base_stride] - energy1[upwind + j * base_stride]; + diffdw = energy1[downwind + j * base_stride] - energy1[donor + j * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmam) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - ener_flux(i, j) = mass_flux_x(i, j) * (energy1(donor, j) + limiter); + ener_flux[i + j * vels_wk_stride] = mass_flux_x[i + j * flux_x_stride] * 
(energy1[donor + j * base_stride] + limiter); }); } @@ -145,15 +160,17 @@ void advec_cell_kernel( // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double pre_mass_s = density1(i, j) * pre_vol(i, j); - double post_mass_s = pre_mass_s + mass_flux_x(i, j) - mass_flux_x(i + 1, j + 0); - double post_ener_s = (energy1(i, j) * pre_mass_s + ener_flux(i, j) - ener_flux(i + 1, j + 0)) / post_mass_s; - double advec_vol_s = pre_vol(i, j) + vol_flux_x(i, j) - vol_flux_x(i + 1, j + 0); - density1(i, j) = post_mass_s / advec_vol_s; - energy1(i, j) = post_ener_s; + double pre_mass_s = density1[i + j * base_stride] * pre_vol[i + j * vels_wk_stride]; + double post_mass_s = pre_mass_s + mass_flux_x[i + j * flux_x_stride] - mass_flux_x[(i + 1) + (j + 0) * flux_x_stride]; + double post_ener_s = (energy1[i + j * base_stride] * pre_mass_s + ener_flux[i + j * vels_wk_stride] - ener_flux[(i + 1) + (j + 0) * vels_wk_stride]) / post_mass_s; + double advec_vol_s = pre_vol[i + j * vels_wk_stride] + vol_flux_x[i + j * flux_x_stride] - vol_flux_x[(i + 1) + (j + 0) * flux_x_stride]; + density1[i + j * base_stride] = post_mass_s / advec_vol_s; + energy1[i + j * base_stride] = post_ener_s; } } @@ -165,11 +182,20 @@ void advec_cell_kernel( if (sweep_number == 1) { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + (vol_flux_y(i + 0, j + 1) - vol_flux_y(i, 
j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j)); - post_vol(i, j) = pre_vol(i, j) - (vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j)); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + + (vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride] + + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - + vol_flux_x[i + j * flux_x_stride]); + post_vol[i + j * vels_wk_stride] = pre_vol[i + j * vels_wk_stride] - (vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]); } } @@ -177,11 +203,16 @@ void advec_cell_kernel( } else { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); - post_vol(i, j) = volume(i, j); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; } } @@ -191,72 +222,81 @@ void advec_cell_kernel( // DO k=y_min,y_max+2 // DO j=x_min,x_max - _Pragma("kernel2d") + double *vertexdy = field.vertexdy.data; + double *density1 = field.density1.data; + double *energy1 = field.energy1.data; + double *mass_flux_y = field.mass_flux_y.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *ener_flux = field.work_array7.data; + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) ({ int upwind, donor, downwind, dif; double sigmat, 
sigma3, sigma4, sigmav, sigma, sigmam, diffuw, diffdw, limiter, wind; - if (vol_flux_y(i, j) > 0.0) { + if (vol_flux_y[i + j * flux_y_stride] > 0.0) { upwind = j - 2; donor = j - 1; downwind = j; dif = donor; } else { - upwind = std::min(j + 1, y_max + 2); + upwind = MIN(j + 1, y_max + 2); donor = j; downwind = j - 1; dif = upwind; } - sigmat = std::fabs(vol_flux_y(i, j)) / pre_vol(i, donor); + sigmat = fabs(vol_flux_y[i + j * flux_y_stride]) / pre_vol[i + donor * vels_wk_stride]; sigma3 = (1.0 + sigmat) * (vertexdy[j] / vertexdy[dif]); sigma4 = 2.0 - sigmat; - sigma = sigmat; +// sigma = sigmat; sigmav = sigmat; - diffuw = density1(i, donor) - density1(i, upwind); - diffdw = density1(i, downwind) - density1(i, donor); + diffuw = density1[i + donor * base_stride] - density1[i + upwind * base_stride]; + diffdw = density1[i + downwind * base_stride] - density1[i + donor * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmav) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - mass_flux_y(i, j) = vol_flux_y(i, j) * (density1(i, donor) + limiter); - sigmam = std::fabs(mass_flux_y(i, j)) / (density1(i, donor) * pre_vol(i, donor)); - diffuw = energy1(i, donor) - energy1(i, upwind); - diffdw = energy1(i, downwind) - energy1(i, donor); + mass_flux_y[i + j * flux_y_stride] = vol_flux_y[i + j * flux_y_stride] * (density1[i + donor * base_stride] + limiter); + sigmam = fabs(mass_flux_y[i + j * flux_y_stride]) / (density1[i + donor * base_stride] * pre_vol[i + donor * vels_wk_stride]); + diffuw = energy1[i + donor * base_stride] - energy1[i + upwind * base_stride]; + diffdw = energy1[i + downwind * base_stride] - energy1[i + donor * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; 
if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmam) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - ener_flux(i, j) = mass_flux_y(i, j) * (energy1(i, donor) + limiter); + ener_flux[i + j * vels_wk_stride] = mass_flux_y[i + j * flux_y_stride] * (energy1[i + donor * base_stride] + limiter); }); } // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double pre_mass_s = density1(i, j) * pre_vol(i, j); - double post_mass_s = pre_mass_s + mass_flux_y(i, j) - mass_flux_y(i + 0, j + 1); - double post_ener_s = (energy1(i, j) * pre_mass_s + ener_flux(i, j) - ener_flux(i + 0, j + 1)) / post_mass_s; - double advec_vol_s = pre_vol(i, j) + vol_flux_y(i, j) - vol_flux_y(i + 0, j + 1); - density1(i, j) = post_mass_s / advec_vol_s; - energy1(i, j) = post_ener_s; + double pre_mass_s = density1[i + j * base_stride] * pre_vol[i + j * vels_wk_stride]; + double post_mass_s = pre_mass_s + mass_flux_y[i + j * flux_y_stride] - mass_flux_y[(i + 0) + (j + 1) * flux_y_stride]; + double post_ener_s = (energy1[i + j * base_stride] * pre_mass_s + ener_flux[i + j * vels_wk_stride] - ener_flux[(i + 0) + (j + 1) * vels_wk_stride]) / post_mass_s; + double advec_vol_s = pre_vol[i + j * vels_wk_stride] + vol_flux_y[i + j * flux_y_stride] - vol_flux_y[(i + 0) + (j + 1) * flux_y_stride]; + density1[i + j * base_stride] = post_mass_s / advec_vol_s; + energy1[i + j * base_stride] = post_ener_s; } } @@ -270,30 +310,24 @@ void advec_cell_kernel( // @details Invokes the user selected advection kernel. 
void advec_cell_driver(global_variables &globals, int tile, int sweep_number, int direction) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + tile_type &t = globals.chunk.tiles[tile]; advec_cell_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, direction, sweep_number, - t.field.vertexdx, - t.field.vertexdy, - t.field.volume, - t.field.density1, - t.field.energy1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.work_array7); + t.field); + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } diff --git a/src/advec_mom.cpp b/src/advec_mom.cpp index 48c226d..eb543ea 100644 --- a/src/advec_mom.cpp +++ b/src/advec_mom.cpp @@ -20,7 +20,7 @@ #include #include "advec_mom.h" -#include "utils.hpp" + // @brief Fortran momentum advection kernel // @author Wayne Gaudin @@ -29,22 +29,10 @@ // Note that although pre_vol is only set and not used in the update, please // leave it in the method. 
void advec_mom_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &vel1, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &mass_flux_y, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &volume, - clover::Buffer2D &density1, - clover::Buffer2D &node_flux, - clover::Buffer2D &node_mass_post, - clover::Buffer2D &node_mass_pre, - clover::Buffer2D &mom_flux, - clover::Buffer2D &pre_vol, - clover::Buffer2D &post_vol, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, + clover::Buffer2D &vel1_buffer, + field_type &field, int which_vel, int sweep_number, int direction) { @@ -55,44 +43,76 @@ void advec_mom_kernel( // DO k=y_min-2,y_max+2 // DO j=x_min-2,x_max+2 + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + if (mom_sweep == 1) { // x 1 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; + double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; } } } else if (mom_sweep == 2) { // y 1 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; 
+ double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; } } } else if (mom_sweep == 3) { // x 2 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; } } } else if (mom_sweep == 4) { // y 2 - _Pragma("kernel2d") + double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 
2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; } } } @@ -103,46 +123,62 @@ void advec_mom_kernel( // DO j=x_min-2,x_max+2 + double *mass_flux_x = field.mass_flux_x.data; + double *node_flux = field.work_array1.data; - _Pragma("kernel2d") + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - node_flux(i, j) = 0.25 * (mass_flux_x(i + 0, j - 1) + mass_flux_x(i, j) + - mass_flux_x(i + 1, j - 1) + mass_flux_x(i + 1, j + 0)); + node_flux[i + j * vels_wk_stride] = 0.25 * (mass_flux_x[(i + 0) + (j - 1) * flux_x_stride] + mass_flux_x[i + j * flux_x_stride] + + mass_flux_x[(i + 1) + (j - 1) * flux_x_stride] + mass_flux_x[(i + 1) + (j + 0) * flux_x_stride]); } } // DO k=y_min,y_max+1 // DO j=x_min-1,x_max+2 + double *density1 = field.density1.data; + double *node_mass_post = field.work_array2.data; + double *node_mass_pre = field.work_array3.data; + double *post_vol = field.work_array6.data; + - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 1 + 1); i < (x_max + 2 + 2); i++) { - node_mass_post(i, j) = 0.25 * (density1(i + 0, j - 1) * - post_vol(i + 0, j - 1) + - density1(i, j) * - post_vol(i, j) + - density1(i - 1, j - 1) * - post_vol(i - 1, j - 1) + - density1(i - 1, j + 0) * post_vol(i - 1, j + 0)); - node_mass_pre(i, j) = node_mass_post(i, j) - node_flux(i - 1, j + 0) + node_flux(i, j); + 
node_mass_post[i + j * vels_wk_stride] = 0.25 * (density1[(i + 0) + (j - 1) * base_stride] * + post_vol[(i + 0) + (j - 1) * vels_wk_stride] + + density1[i + j * base_stride] * + post_vol[i + j * vels_wk_stride] + + density1[(i - 1) + (j - 1) * base_stride] * + post_vol[(i - 1) + (j - 1) * vels_wk_stride] + + density1[(i - 1) + (j + 0) * base_stride] * post_vol[(i - 1) + (j + 0) * vels_wk_stride]); + node_mass_pre[i + j * vels_wk_stride] = + node_mass_post[i + j * vels_wk_stride] - node_flux[(i - 1) + (j + 0) * vels_wk_stride] + node_flux[i + j * vels_wk_stride]; } } } - // DO k=y_min,y_max+1 - // DO j=x_min-1,x_max+1 + // DO k=y_min,y_max+1 + // DO j=x_min-1,x_max+1 + + const int vel1_sizex = vel1_buffer.nX(); + double *vel1 = vel1_buffer.data; + double *node_flux = field.work_array1.data; + double *node_mass_pre = field.work_array3.data; + double *mom_flux = field.work_array4.data; + double *celldx = field.celldx.data; - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 1 + 1); i < (x_max + 1 + 2); i++) ({ int upwind, donor, downwind, dif; double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; - if (node_flux(i, j) < 0.0) { + if (node_flux[i + j * vels_wk_stride] < 0.0) { upwind = i + 2; donor = i + 1; downwind = i; @@ -153,21 +189,21 @@ void advec_mom_kernel( downwind = i + 1; dif = upwind; } - sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(donor, j)); + sigma = fabs(node_flux[i + j * vels_wk_stride]) / (node_mass_pre[donor + j * vels_wk_stride]); width = celldx[i]; - vdiffuw = vel1(donor, j) - vel1(upwind, j); - vdiffdw = vel1(downwind, j) - vel1(donor, j); + vdiffuw = vel1[donor + j * vel1_sizex] - vel1[upwind + j * vel1_sizex]; + vdiffdw = vel1[downwind + j * vel1_sizex] - vel1[donor + j * vel1_sizex]; limiter = 0.0; if (vdiffuw * vdiffdw > 0.0) { - auw = std::fabs(vdiffuw); - adw = 
std::fabs(vdiffdw); + auw = fabs(vdiffuw); + adw = fabs(vdiffdw); wind = 1.0; if (vdiffdw <= 0.0)wind = -1.0; - limiter = wind * std::fmin(std::fmin( + limiter = wind * fmin(fmin( width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldx[dif]) / 6.0, auw), adw); } - advec_vel_s = vel1(donor, j) + (1.0 - sigma) * limiter; - mom_flux(i, j) = advec_vel_s * node_flux(i, j); + advec_vel_s = vel1[donor + j * vel1_sizex] + (1.0 - sigma) * limiter; + mom_flux[i + j * vels_wk_stride] = advec_vel_s * node_flux[i + j * vels_wk_stride]; }); } @@ -176,10 +212,15 @@ void advec_mom_kernel( - _Pragma("kernel2d") + double *node_mass_post = field.work_array2.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vel1(i, j) = (vel1(i, j) * node_mass_pre(i, j) + mom_flux(i - 1, j + 0) - mom_flux(i, j)) / node_mass_post(i, j); + vel1[i + j * vel1_sizex] = + (vel1[i + j * vel1_sizex] * node_mass_pre[i + j * vels_wk_stride] + mom_flux[(i - 1) + (j + 0) * vels_wk_stride] - mom_flux[i + j * vels_wk_stride]) / + node_mass_post[i + j * vels_wk_stride]; } } } else if (direction == 2) { @@ -188,45 +229,60 @@ void advec_mom_kernel( // DO j=x_min,x_max+1 + double *node_flux = field.work_array1.data; + double *mass_flux_y = field.mass_flux_y.data; - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - node_flux(i, j) = 0.25 * (mass_flux_y(i - 1, j + 0) + mass_flux_y(i, j) + - mass_flux_y(i - 1, j + 1) + mass_flux_y(i + 0, j + 1)); + node_flux[i + j * vels_wk_stride] = 0.25 * (mass_flux_y[(i - 1) + (j + 0) * flux_y_stride] + mass_flux_y[i + j * flux_y_stride] + + mass_flux_y[(i - 1) + (j + 1) * flux_y_stride] + mass_flux_y[(i + 0) + (j + 1) * 
flux_y_stride]); } } // DO k=y_min-1,y_max+2 // DO j=x_min,x_max+1 + double *density1 = field.density1.data; + double *node_mass_post = field.work_array2.data; + double *node_mass_pre = field.work_array3.data; + double *post_vol = field.work_array6.data; - _Pragma("kernel2d") + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 1 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - node_mass_post(i, j) = 0.25 * (density1(i + 0, j - 1) * - post_vol(i + 0, j - 1) + - density1(i, j) * - post_vol(i, j) + - density1(i - 1, j - 1) * - post_vol(i - 1, j - 1) + - density1(i - 1, j + 0) * - post_vol(i - 1, j + 0)); - node_mass_pre(i, j) = node_mass_post(i, j) - node_flux(i + 0, j - 1) + node_flux(i, j); + node_mass_post[i + j * vels_wk_stride] = 0.25 * (density1[(i + 0) + (j - 1) * base_stride] * + post_vol[(i + 0) + (j - 1) * vels_wk_stride] + + density1[i + j * base_stride] * + post_vol[i + j * vels_wk_stride] + + density1[(i - 1) + (j - 1) * base_stride] * + post_vol[(i - 1) + (j - 1) * vels_wk_stride] + + density1[(i - 1) + (j + 0) * base_stride] * + post_vol[(i - 1) + (j + 0) * vels_wk_stride]); + node_mass_pre[i + j * vels_wk_stride] = + node_mass_post[i + j * vels_wk_stride] - node_flux[(i + 0) + (j - 1) * vels_wk_stride] + node_flux[i + j * vels_wk_stride]; } } } - // DO k=y_min-1,y_max+1 - // DO j=x_min,x_max+1 + // DO k=y_min-1,y_max+1 + // DO j=x_min,x_max+1 - _Pragma("kernel2d") + const int vel1_sizex = vel1_buffer.nX(); + double *vel1 = vel1_buffer.data; + double *node_flux = field.work_array1.data; + double *node_mass_pre = field.work_array3.data; + double *mom_flux = field.work_array4.data; + double *celldy = field.celldy.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 1 + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) ({ int upwind, donor, 
downwind, dif; double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; - if (node_flux(i, j) < 0.0) { + if (node_flux[i + j * vels_wk_stride] < 0.0) { upwind = j + 2; donor = j + 1; downwind = j; @@ -237,21 +293,21 @@ void advec_mom_kernel( downwind = j + 1; dif = upwind; } - sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); + sigma = fabs(node_flux[i + j * vels_wk_stride]) / (node_mass_pre[i + donor * vels_wk_stride]); width = celldy[j]; - vdiffuw = vel1(i, donor) - vel1(i, upwind); - vdiffdw = vel1(i, downwind) - vel1(i, donor); + vdiffuw = vel1[i + donor * vel1_sizex] - vel1[i + upwind * vel1_sizex]; + vdiffdw = vel1[i + downwind * vel1_sizex] - vel1[i + donor * vel1_sizex]; limiter = 0.0; if (vdiffuw * vdiffdw > 0.0) { - auw = std::fabs(vdiffuw); - adw = std::fabs(vdiffdw); + auw = fabs(vdiffuw); + adw = fabs(vdiffdw); wind = 1.0; if (vdiffdw <= 0.0)wind = -1.0; - limiter = wind * std::fmin(std::fmin( + limiter = wind * fmin(fmin( width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); } - advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; - mom_flux(i, j) = advec_vel_s * node_flux(i, j); + advec_vel_s = vel1[i + donor * vel1_sizex] + (1.0 - sigma) * limiter; + mom_flux[i + j * vels_wk_stride] = advec_vel_s * node_flux[i + j * vels_wk_stride]; }); } @@ -261,10 +317,15 @@ void advec_mom_kernel( - _Pragma("kernel2d") + double *node_mass_post = field.work_array2.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vel1(i, j) = (vel1(i, j) * node_mass_pre(i, j) + mom_flux(i + 0, j - 1) - mom_flux(i, j)) / node_mass_post(i, j); + vel1[i + j * vel1_sizex] = + (vel1[i + j * vel1_sizex] * node_mass_pre[i + j * vels_wk_stride] + mom_flux[(i + 0) + (j - 1) * vels_wk_stride] - mom_flux[i + j * vels_wk_stride]) / + node_mass_post[i + j 
* vels_wk_stride]; } } } @@ -278,57 +339,42 @@ void advec_mom_driver(global_variables &globals, int tile, int which_vel, int di int sweep_number) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + + tile_type &t = globals.chunk.tiles[tile]; if (which_vel == 1) { advec_mom_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, t.field.xvel1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.volume, - t.field.density1, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.celldx, - t.field.celldy, + t.field, which_vel, sweep_number, direction); } else { advec_mom_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, t.field.yvel1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.volume, - t.field.density1, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.celldx, - t.field.celldy, + t.field, which_vel, sweep_number, direction); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/build_field.cpp b/src/build_field.cpp index f9c9842..06bb7f5 100644 --- a/src/build_field.cpp +++ b/src/build_field.cpp @@ -25,14 +25,87 @@ #include "build_field.h" -#include "utils.hpp" -// Allocate Kokkos Views for the data arrays + +// Allocate device buffers for the data arrays void build_field(global_variables &globals) { for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; + field_type &field = t.field; + + + + double * density0 = field.density0.data; + double * density1 = field.density1.data; + double * energy0 = field.energy0.data; + double * energy1 = field.energy1.data; + double * pressure = field.pressure.data; + double * viscosity = 
field.viscosity.data; + double * soundspeed = field.soundspeed.data; + double * yvel0 = field.yvel0.data; + double * yvel1 = field.yvel1.data; + double * xvel0 = field.xvel0.data; + double * xvel1 = field.xvel1.data; + double * vol_flux_x = field.vol_flux_x.data; + double * vol_flux_y = field.vol_flux_y.data; + double * mass_flux_x = field.mass_flux_x.data; + double * mass_flux_y = field.mass_flux_y.data; + double * work_array1 = field.work_array1.data; + double * work_array2 = field.work_array2.data; + double * work_array3 = field.work_array3.data; + double * work_array4 = field.work_array4.data; + double * work_array5 = field.work_array5.data; + double * work_array6 = field.work_array6.data; + double * work_array7 = field.work_array7.data; + double * cellx = field.cellx.data; + double * celldx = field.celldx.data; + double * celly = field.celly.data; + double * celldy = field.celldy.data; + double * vertexx = field.vertexx.data; + double * vertexdx = field.vertexdx.data; + double * vertexy = field.vertexy.data; + double * vertexdy = field.vertexdy.data; + double * volume = field.volume.data; + double * xarea = field.xarea.data; + double * yarea = field.yarea.data; + + + #pragma omp target enter data \ + map(alloc: density0[:field.density0.N()]) \ + map(alloc: density1[:field.density1.N()]) \ + map(alloc: energy0[:field.energy0.N()]) \ + map(alloc: energy1[:field.energy1.N()]) \ + map(alloc: pressure[:field.pressure.N()]) \ + map(alloc: viscosity[:field.viscosity.N()]) \ + map(alloc: soundspeed[:field.soundspeed.N()]) \ + map(alloc: yvel0[:field.yvel0.N()]) \ + map(alloc: yvel1[:field.yvel1.N()]) \ + map(alloc: xvel0[:field.xvel0.N()]) \ + map(alloc: xvel1[:field.xvel1.N()]) \ + map(alloc: vol_flux_x[:field.vol_flux_x.N()]) \ + map(alloc: vol_flux_y[:field.vol_flux_y.N()]) \ + map(alloc: mass_flux_x[:field.mass_flux_x.N()]) \ + map(alloc: mass_flux_y[:field.mass_flux_y.N()]) \ + map(alloc: work_array1[:field.work_array1.N()]) \ + map(alloc: 
work_array2[:field.work_array2.N()]) \ + map(alloc: work_array3[:field.work_array3.N()]) \ + map(alloc: work_array4[:field.work_array4.N()]) \ + map(alloc: work_array5[:field.work_array5.N()]) \ + map(alloc: work_array6[:field.work_array6.N()]) \ + map(alloc: work_array7[:field.work_array7.N()]) \ + map(alloc: cellx[:field.cellx.N()]) \ + map(alloc: celldx[:field.celldx.N()]) \ + map(alloc: celly[:field.celly.N()]) \ + map(alloc: celldy[:field.celldy.N()]) \ + map(alloc: vertexx[:field.vertexx.N()]) \ + map(alloc: vertexdx[:field.vertexdx.N()]) \ + map(alloc: vertexy[:field.vertexy.N()]) \ + map(alloc: vertexdy[:field.vertexdy.N()]) \ + map(alloc: volume[:field.volume.N()]) \ + map(alloc: xarea[:field.xarea.N()]) \ + map(alloc: yarea[:field.yarea.N()]) \ const int xrange = (t.info.t_xmax + 2) - (t.info.t_xmin - 2) + 1; const int yrange = (t.info.t_ymax + 2) - (t.info.t_ymin - 2) + 1; @@ -94,90 +167,99 @@ void build_field(global_variables &globals) { // cycle which can skew timings in the first step // Take a reference to the lowest structure, as Kokkos device cannot necessarily chase through the structure. 
- field_type &field = t.field; // Kokkos::MDRangePolicy > loop_bounds_1({0, 0}, {xrange + 1, yrange + 1}); // Nested loop over (t_ymin-2:t_ymax+3) and (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange + 1); j++) { - for (int i = (0); i < (xrange + 1); i++) { - field.work_array1(i, j) = 0.0; - field.work_array2(i, j) = 0.0; - field.work_array3(i, j) = 0.0; - field.work_array4(i, j) = 0.0; - field.work_array5(i, j) = 0.0; - field.work_array6(i, j) = 0.0; - field.work_array7(i, j) = 0.0; - field.xvel0(i, j) = 0.0; - field.xvel1(i, j) = 0.0; - field.yvel0(i, j) = 0.0; - field.yvel1(i, j) = 0.0; + + + const int vels_wk_stride = field.vels_wk_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange + 1); j++) { + for (int i = 0; i < (xrange + 1); i++) { + work_array1[i + j * vels_wk_stride] = 0.0; + work_array2[i + j * vels_wk_stride] = 0.0; + work_array3[i + j * vels_wk_stride] = 0.0; + work_array4[i + j * vels_wk_stride] = 0.0; + work_array5[i + j * vels_wk_stride] = 0.0; + work_array6[i + j * vels_wk_stride] = 0.0; + work_array7[i + j * vels_wk_stride] = 0.0; + xvel0[i + j * vels_wk_stride] = 0.0; + xvel1[i + j * vels_wk_stride] = 0.0; + yvel0[i + j * vels_wk_stride] = 0.0; + yvel1[i + j * vels_wk_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+2) and (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.density0(i, j) = 0.0; - field.density1(i, j) = 0.0; - field.energy0(i, j) = 0.0; - field.energy1(i, j) = 0.0; - field.pressure(i, j) = 0.0; - field.viscosity(i, j) = 0.0; - field.soundspeed(i, j) = 0.0; - field.volume(i, j) = 0.0; + const int base_stride = field.base_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); 
i++) { + density0[i + j * base_stride] = 0.0; + density1[i + j * base_stride] = 0.0; + energy0[i + j * base_stride] = 0.0; + energy1[i + j * base_stride] = 0.0; + pressure[i + j * base_stride] = 0.0; + viscosity[i + j * base_stride] = 0.0; + soundspeed[i + j * base_stride] = 0.0; + volume[i + j * base_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+2) and (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.vol_flux_x(i, j) = 0.0; - field.mass_flux_x(i, j) = 0.0; - field.xarea(i, j) = 0.0; + const int flux_x_stride = field.flux_x_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { + vol_flux_x[i + j * flux_x_stride] = 0.0; + mass_flux_x[i + j * flux_x_stride] = 0.0; + xarea[i + j * flux_x_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+3) and (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange + 1); j++) { - for (int i = (0); i < (xrange); i++) { - field.vol_flux_y(i, j) = 0.0; - field.mass_flux_y(i, j) = 0.0; - field.yarea(i, j) = 0.0; + const int flux_y_stride = field.flux_y_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange + 1); j++) { + for (int i = 0; i < (xrange); i++) { + vol_flux_y[i + j * flux_y_stride] = 0.0; + mass_flux_y[i + j * flux_y_stride] = 0.0; + yarea[i + j * flux_y_stride] = 0.0; } } // (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (xrange); id++) { - field.cellx[id] = 0.0; - field.celldx[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (xrange); id++) { + cellx[id] = 0.0; + celldx[id] = 0.0; } // (t_ymin-2:t_ymax+2) inclusive - _Pragma("kernel1d") - for (int id = 
(0); id < (yrange); id++) { - field.celly[id] = 0.0; - field.celldy[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (yrange); id++) { + celly[id] = 0.0; + celldy[id] = 0.0; } // (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (xrange + 1); id++) { - field.vertexx[id] = 0.0; - field.vertexdx[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (xrange + 1); id++) { + vertexx[id] = 0.0; + vertexdx[id] = 0.0; } // (t_ymin-2:t_ymax+3) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (yrange + 1); id++) { - field.vertexy[id] = 0.0; - field.vertexdy[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (yrange + 1); id++) { + vertexy[id] = 0.0; + vertexdy[id] = 0.0; } diff --git a/src/calc_dt.cpp b/src/calc_dt.cpp index 528e249..18694d0 100644 --- a/src/calc_dt.cpp +++ b/src/calc_dt.cpp @@ -21,7 +21,7 @@ #include #include "calc_dt.h" -#include "utils.hpp" + #include // @brief Fortran timestep kernel @@ -32,26 +32,14 @@ void calc_dt_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dtmin, double dtc_safe, double dtu_safe, double dtv_safe, double dtdiv_safe, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer1D &cellx, - clover::Buffer1D &celly, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &energy0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity_a, - clover::Buffer2D &soundspeed, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, + field_type &field, double &dt_min_val, int &dtl_control, double &xl_pos, @@ -70,35 +58,56 @@ void calc_dt_kernel( // Kokkos::MDRangePolicy > policy({x_min + 1, y_min + 1}, {x_max + 2, y_max + 2}); - _Pragma("kernel2d") + const int 
flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *celldx = field.celldx.data; + double *celldy = field.celldy.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + + // XXX See https://forums.developer.nvidia.com/t/nvc-f-0000-internal-compiler-error-unhandled-size-for-preparing-max-constant/221740 + double dt_min_val0 = dt_min_val; + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) map(tofrom:dt_min_val) reduction(min:dt_min_val0) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { double dsx = celldx[i]; double dsy = celldy[j]; - double cc = soundspeed(i, j) * soundspeed(i, j); - cc = cc + 2.0 * viscosity_a(i, j) / density0(i, j); - cc = std::fmax(std::sqrt(cc), g_small); - double dtct = dtc_safe * std::fmin(dsx, dsy) / cc; + double cc = soundspeed[i + j * base_stride] * soundspeed[i + j * base_stride]; + cc = cc + 2.0 * viscosity[i + j * base_stride] / density0[i + j * base_stride]; + cc = fmax(sqrt(cc), g_small); + double dtct = dtc_safe * fmin(dsx, dsy) / cc; double div = 0.0; - double dv1 = (xvel0(i, j) + xvel0(i + 0, j + 1)) * xarea(i, j); - double dv2 = (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1)) * xarea(i + 1, j + 0); + double dv1 = (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride]) * xarea[i + j * flux_x_stride]; + double dv2 = (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + xvel0[(i + 1) + (j + 1) * vels_wk_stride]) * xarea[(i + 1) + (j + 0) * flux_x_stride]; div = div + dv2 - dv1; - double dtut = dtu_safe * 2.0 * volume(i, j) / 
std::fmax(std::fmax(std::fabs(dv1), std::fabs(dv2)), g_small * volume(i, j)); - dv1 = (yvel0(i, j) + yvel0(i + 1, j + 0)) * yarea(i, j); - dv2 = (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1)) * yarea(i + 0, j + 1); + double dtut = dtu_safe * 2.0 * volume[i + j * base_stride] / fmax(fmax(fabs(dv1), fabs(dv2)), g_small * volume[i + j * base_stride]); + dv1 = (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * vels_wk_stride]) * yarea[i + j * flux_y_stride]; + dv2 = (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + yvel0[(i + 1) + (j + 1) * vels_wk_stride]) * yarea[(i + 0) + (j + 1) * flux_y_stride]; div = div + dv2 - dv1; - double dtvt = dtv_safe * 2.0 * volume(i, j) / std::fmax(std::fmax(std::fabs(dv1), std::fabs(dv2)), g_small * volume(i, j)); - div = div / (2.0 * volume(i, j)); + double dtvt = dtv_safe * 2.0 * volume[i + j * base_stride] / fmax(fmax(fabs(dv1), fabs(dv2)), g_small * volume[i + j * base_stride]); + div = div / (2.0 * volume[i + j * base_stride]); double dtdivt; if (div < -g_small) { dtdivt = dtdiv_safe * (-1.0 / div); } else { dtdivt = g_big; } - double mins = std::fmin(dtct, std::fmin(dtut, std::fmin(dtvt, std::fmin(dtdivt, g_big)))); - dt_min_val = std::fmin(mins, dt_min_val); + double mins = fmin(dtct, fmin(dtut, fmin(dtvt, fmin(dtdivt, g_big)))); + dt_min_val0 = fmin(mins, dt_min_val0); } } + dt_min_val = dt_min_val0; dtl_control = static_cast(10.01 * (jk_control - static_cast(jk_control))); @@ -111,14 +120,14 @@ void calc_dt_kernel( if (small != 0) { - auto cellx_acc = cellx; - auto celly_acc = celly; - auto density0_acc = density0; - auto energy0_acc = energy0; - auto pressure_acc = pressure; - auto soundspeed_acc = soundspeed; - auto xvel0_acc = xvel0; - auto yvel0_acc = yvel0; + auto &cellx_acc = field.cellx; + auto &celly_acc = field.celly; + auto &density0_acc = field.density0; + auto &energy0_acc = field.energy0; + auto &pressure_acc = field.pressure; + auto &soundspeed_acc = field.soundspeed; + auto &xvel0_acc = field.xvel0; + auto 
&yvel0_acc = field.yvel0; std::cout << "Timestep information:" << std::endl @@ -153,9 +162,13 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string int l_control; int small = 0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif tile_type &t = globals.chunk.tiles[tile]; calc_dt_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -165,20 +178,7 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string globals.config.dtu_safe, globals.config.dtv_safe, globals.config.dtdiv_safe, - t.field.xarea, - t.field.yarea, - t.field.cellx, - t.field.celly, - t.field.celldx, - t.field.celldy, - t.field.volume, - t.field.density0, - t.field.energy0, - t.field.pressure, - t.field.viscosity, - t.field.soundspeed, - t.field.xvel0, - t.field.yvel0, + t.field, local_dt, l_control, xl_pos, @@ -188,6 +188,10 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string small ); + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (l_control == 1) local_control = "sound"; if (l_control == 2) local_control = "xvel"; diff --git a/src/clover_leaf.cpp b/src/clover_leaf.cpp index b28c84e..c4e5a2d 100644 --- a/src/clover_leaf.cpp +++ b/src/clover_leaf.cpp @@ -47,6 +47,7 @@ #include "hydro.h" #include "initialise.h" #include "version.h" +#include "finalise_field.h" #include #include @@ -54,7 +55,6 @@ std::ostream g_out(nullptr); int main(int argc, char *argv[]) { - // Initialise MPI first MPI_Init(&argc, &argv); @@ -74,11 +74,13 @@ int main(int argc, char *argv[]) { } - std::unique_ptr config = initialise(parallel, - std::vector(argv + 1, argv + argc)); + auto config = initialise(parallel, std::vector(argv + 1, argv + argc)); std::cout << "Launching hydro" << std::endl; - hydro(*config, parallel); + hydro(config, parallel); + + // calls the appropriate omp target exit data for all buffers, see build_field.cpp for the enter data half + finalise_field(config); // Finilise programming 
models // Kokkos::finalize(); diff --git a/src/comms.cpp b/src/comms.cpp index 8831ad6..5691eb5 100644 --- a/src/comms.cpp +++ b/src/comms.cpp @@ -34,7 +34,7 @@ #include "comms.h" #include "pack_kernel.h" -#include "utils.hpp" + #include @@ -61,6 +61,8 @@ void clover_barrier() { } void clover_barrier(global_variables &globals) { + #pragma omp flush +// globals.deviceToHost();; clover_barrier(); } @@ -498,8 +500,13 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -512,6 +519,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_density1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -524,6 +532,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_energy0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -536,6 +545,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_energy1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -548,6 +558,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_pressure] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -560,6 +571,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_viscosity] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -572,6 +584,7 @@ void 
clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_soundspeed] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -584,6 +597,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_xvel0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -596,6 +610,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_xvel1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -608,6 +623,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_yvel0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -620,6 +636,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_yvel1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -632,6 +649,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_vol_flux_x] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -644,6 +662,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_vol_flux_y] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -656,6 +675,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_mass_flux_x] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -668,6 +688,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_mass_flux_y] == 1) { clover_pack_message_left( + 
globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -678,26 +699,37 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_left( global_variables &globals, - clover::Buffer1D &left_snd, - clover::Buffer1D &left_rcv, + clover::Buffer1D &left_snd_buffer, + clover::Buffer1D &left_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { // First copy send buffer from device to host // Kokkos::deep_copy(globals.chunk.hm_left_snd, left_snd); + int left_task = globals.chunk.chunk_neighbours[chunk_left] - 1; - MPI_Isend(globals.chunk.left_snd.actual(), total_size, MPI_DOUBLE, left_task, tag_send, + + double *left_snd = left_snd_buffer.data; + double *left_rcv = left_rcv_buffer.data; + #pragma omp target update from(left_snd[:left_snd_buffer.N()]) + + MPI_Isend(left_snd, total_size, MPI_DOUBLE, left_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.left_rcv.actual(), total_size, MPI_DOUBLE, left_task, tag_recv, + MPI_Irecv(left_rcv, total_size, MPI_DOUBLE, left_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(left_rcv[:left_rcv_buffer.N()]) + + } void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -706,8 +738,13 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -720,6 +757,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_density1] == 1) { 
clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -732,6 +770,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -744,6 +783,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -756,6 +796,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_pressure] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -768,6 +809,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_viscosity] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -780,6 +822,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_soundspeed] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -792,6 +835,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -804,6 +848,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -816,6 +861,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -828,6 +874,7 @@ void 
clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -840,6 +887,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -852,6 +900,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -864,6 +913,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -876,6 +926,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -886,7 +937,9 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_pack_right(global_variables &globals, int tile, const int fields[NUM_FIELDS], int depth, @@ -895,8 +948,13 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -909,6 +967,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if 
(fields[field_density1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -921,6 +980,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_energy0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -933,6 +993,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_energy1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -945,6 +1006,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_pressure] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -957,6 +1019,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_viscosity] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -969,6 +1032,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_soundspeed] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -981,6 +1045,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_xvel0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -993,6 +1058,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_xvel1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1005,6 +1071,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_yvel0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1017,6 +1084,7 
@@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_yvel1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1029,6 +1097,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_vol_flux_x] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1041,6 +1110,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_vol_flux_y] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1053,6 +1123,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_mass_flux_x] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1065,6 +1136,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_mass_flux_y] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1075,13 +1147,15 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_right( global_variables &globals, - clover::Buffer1D &right_snd, - clover::Buffer1D &right_rcv, + clover::Buffer1D &right_snd_buffer, + clover::Buffer1D &right_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1090,11 +1164,16 @@ void clover_send_recv_message_right( int right_task = globals.chunk.chunk_neighbours[chunk_right] - 1; - MPI_Isend(globals.chunk.right_snd.actual(), total_size, MPI_DOUBLE, right_task, + double *right_snd = right_snd_buffer.data; + double *right_rcv = right_rcv_buffer.data; + #pragma omp 
target update from(right_snd[:right_snd_buffer.N()]) + + MPI_Isend(right_snd, total_size, MPI_DOUBLE, right_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.right_rcv.actual(), total_size, MPI_DOUBLE, right_task, + MPI_Irecv(right_rcv, total_size, MPI_DOUBLE, right_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(right_rcv[:right_rcv_buffer.N()]) } void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1103,8 +1182,13 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1117,6 +1201,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_density1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1129,6 +1214,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_energy0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1141,6 +1227,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_energy1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1153,6 +1240,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_pressure] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1165,6 +1253,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_viscosity] == 1) { 
clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1177,6 +1266,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_soundspeed] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1189,6 +1279,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_xvel0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1201,6 +1292,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_xvel1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1213,6 +1305,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_yvel0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1225,6 +1318,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_yvel1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1237,6 +1331,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1249,6 +1344,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1261,6 +1357,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1273,6 +1370,7 
@@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1283,7 +1381,9 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_FIELDS], int depth, @@ -1292,8 +1392,13 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1306,6 +1411,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_density1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1318,6 +1424,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_energy0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1330,6 +1437,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_energy1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1342,6 +1450,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_pressure] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1354,6 +1463,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if 
(fields[field_viscosity] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1366,6 +1476,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_soundspeed] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1378,6 +1489,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_xvel0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1390,6 +1502,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_xvel1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1402,6 +1515,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_yvel0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1414,6 +1528,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_yvel1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1426,6 +1541,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_vol_flux_x] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1438,6 +1554,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_vol_flux_y] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1450,6 +1567,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_mass_flux_x] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1462,6 +1580,7 @@ 
void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_mass_flux_y] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1472,13 +1591,15 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_top( global_variables &globals, - clover::Buffer1D &top_snd, - clover::Buffer1D &top_rcv, + clover::Buffer1D &top_snd_buffer, + clover::Buffer1D &top_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1487,11 +1608,16 @@ void clover_send_recv_message_top( int top_task = globals.chunk.chunk_neighbours[chunk_top] - 1; - MPI_Isend(globals.chunk.top_snd.actual(), total_size, MPI_DOUBLE, top_task, tag_send, + double *top_snd = top_snd_buffer.data; + double *top_rcv = top_rcv_buffer.data; + #pragma omp target update from(top_snd[:top_snd_buffer.N()]) + + MPI_Isend(top_snd, total_size, MPI_DOUBLE, top_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.top_rcv.actual(), total_size, MPI_DOUBLE, top_task, tag_recv, + MPI_Irecv(top_rcv, total_size, MPI_DOUBLE, top_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(top_rcv[:top_rcv_buffer.N()]) } void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1500,8 +1626,13 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1514,6 +1645,7 @@ void clover_unpack_top(global_variables &globals, const int 
fields[NUM_FIELDS], } if (fields[field_density1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1526,6 +1658,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1538,6 +1671,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1550,6 +1684,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_pressure] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1562,6 +1697,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_viscosity] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1574,6 +1710,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_soundspeed] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1586,6 +1723,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1598,6 +1736,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1610,6 +1749,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, 
t.info.t_ymin, @@ -1622,6 +1762,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1634,6 +1775,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1646,6 +1788,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1658,6 +1801,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1670,6 +1814,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1690,8 +1835,13 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1704,6 +1854,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_density1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1716,6 +1867,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_energy0] == 1) { clover_pack_message_bottom( + 
globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1728,6 +1880,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_energy1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1740,6 +1893,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_pressure] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1752,6 +1906,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_viscosity] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1764,6 +1919,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_soundspeed] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1776,6 +1932,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_xvel0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1788,6 +1945,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_xvel1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1800,6 +1958,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_yvel0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1812,6 +1971,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_yvel1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1824,6 +1984,7 @@ void clover_pack_bottom(global_variables 
&globals, int tile, const int fields[NU } if (fields[field_vol_flux_x] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1836,6 +1997,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_vol_flux_y] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1848,6 +2010,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_mass_flux_x] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1860,6 +2023,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_mass_flux_y] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1870,13 +2034,15 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_bottom( global_variables &globals, - clover::Buffer1D &bottom_snd, - clover::Buffer1D &bottom_rcv, + clover::Buffer1D &bottom_snd_buffer, + clover::Buffer1D &bottom_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1885,11 +2051,16 @@ void clover_send_recv_message_bottom( int bottom_task = globals.chunk.chunk_neighbours[chunk_bottom] - 1; - MPI_Isend(globals.chunk.bottom_snd.actual(), total_size, MPI_DOUBLE, bottom_task, + double *bottom_snd = bottom_snd_buffer.data; + double *bottom_rcv = bottom_rcv_buffer.data; + #pragma omp target update from(bottom_snd[:bottom_snd_buffer.N()]) + + MPI_Isend(bottom_snd, total_size, MPI_DOUBLE, bottom_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.bottom_rcv.actual(), total_size, MPI_DOUBLE, bottom_task, + 
MPI_Irecv(bottom_rcv, total_size, MPI_DOUBLE, bottom_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(bottom_rcv[:bottom_rcv_buffer.N()]) } void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1899,8 +2070,13 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1913,6 +2089,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_density1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1925,6 +2102,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_energy0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1937,6 +2115,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_energy1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1949,6 +2128,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_pressure] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1961,6 +2141,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_viscosity] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1973,6 +2154,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_soundspeed] == 1) { 
clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1985,6 +2167,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_xvel0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1997,6 +2180,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_xvel1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2009,6 +2193,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_yvel0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2021,6 +2206,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_yvel1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2033,6 +2219,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2045,6 +2232,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2057,6 +2245,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2069,6 +2258,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ 
-2079,5 +2269,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } diff --git a/src/comms.h b/src/comms.h index ba4f2b6..6b55fe1 100644 --- a/src/comms.h +++ b/src/comms.h @@ -22,8 +22,6 @@ #define COMMS_H #include "definitions.h" -#include "utils.hpp" - #include // Structure to hold MPI rank information diff --git a/src/definitions.h b/src/definitions.h index 528db49..498803a 100644 --- a/src/definitions.h +++ b/src/definitions.h @@ -20,7 +20,12 @@ #ifndef GRID_H #define GRID_H +// Enables dumping buffers at each iteration as text files, see hydro.cpp for actual implementation #define DEBUG false +// Enables buffer synchronisation between host and device before and after each kernel invocation. +// This is useful for debugging individual kernels; +// by synchronising buffer data, not all kernels have to be executed on the device or host +#define SYNC_BUFFERS 0 #include @@ -29,7 +34,8 @@ #include #include #include -#include "utils.hpp" +#include +#include #define g_ibig 640000 @@ -37,8 +43,111 @@ #define g_big (1.0e+21) #define NUM_FIELDS 15 +#ifdef OMP_ALLOW_HOST +#define clover_use_target(cond) if(target: (cond)) +#else +#define clover_use_target(cond) /*no-op*/ +#endif + +namespace clover { + + template + static T cpp14_exchange(T &obj, U &&new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; + } + + template + struct Buffer1D { + + private: + const size_t size; + + public: + T *data; + + explicit Buffer1D(size_t size) : size(size), data(new T[size]) { + assert(size > 0); + } + + Buffer1D(const Buffer1D &that) : size(that.size), data(new T[size]) { + std::copy(that.data, that.data + size, data); + } + + Buffer1D(Buffer1D &&other) noexcept: size(other.size), data(cpp14_exchange(other.data, nullptr)) {} + + Buffer1D &operator=(Buffer1D &&other) noexcept { + 
size = other.size; + std::swap(data, other.data); + return *this; + } + + [[nodiscard]] T &operator[](size_t i) { return data[i]; } + [[nodiscard]] T operator[](size_t i) const { return data[i]; } + + [[nodiscard]] constexpr size_t N() const { return size; } + + + Buffer1D &operator=(const Buffer1D &other) { + if (this != &other) { + delete[] data; + std::copy(other.data, other.data + size, data); + size = other.size; + } + return *this; + } + + ~Buffer1D() { delete[] data; } + }; + + + template + struct Buffer2D { + private: + const size_t sizeX, sizeY; + public: + + T *data; + + Buffer2D(size_t sizeX, size_t sizeY) : sizeX(sizeX), sizeY(sizeY), data(new T[sizeX * sizeY]) { + assert(sizeX > 0); + assert(sizeY > 0); + } + Buffer2D(const Buffer2D &that) : sizeX(that.sizeX), sizeY(that.sizeY), data(new T[sizeX * sizeY]) { + std::copy(that.data, that.data + (sizeX * sizeY), data); + } + + Buffer2D(Buffer2D &&other) noexcept: sizeX(other.sizeX), sizeY(other.sizeY), data(cpp14_exchange(other.data, nullptr)) {} + + + [[nodiscard]] const T &operator()(size_t i, size_t j) { return data[i + j * sizeX]; } + [[nodiscard]] T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } + [[nodiscard]] constexpr size_t N() const { return sizeX * sizeY; } + [[nodiscard]] constexpr size_t nX() const { return sizeX; } + [[nodiscard]] constexpr size_t nY() const { return sizeY; } + + + Buffer2D &operator=(const Buffer2D &other) { + if (this != &other) { + return *this = Buffer2D(other); + } + } + + Buffer2D &operator=(Buffer2D &&other) noexcept { + sizeX = other.sizeX; + sizeY = other.sizeY; + std::swap(data, other.data); + return *this; + } + ~Buffer2D() { delete[] data; } + + }; + +} + typedef std::chrono::time_point timepoint; @@ -63,10 +172,10 @@ static inline void record(const std::string &name, const std::function &buffer) { - out << name << "(" << 1 << ") [" << buffer.size() << "]" << std::endl; +show(std::ostream &out, const std::string &name, const clover::Buffer1D 
&buffer) { + out << name << "(" << 1 << ") [" << buffer.N() << "]" << std::endl; out << "\t"; - for (size_t i = 0; i < buffer.size(); ++i) { + for (size_t i = 0; i < buffer.N(); ++i) { out << buffer[i] << ", "; } out << std::endl; @@ -74,11 +183,11 @@ show(std::ostream &out, const std::string &name, clover::Buffer1D &buffe // formats and then dumps content of 2d double buffer to stream static inline void show(std::ostream &out, const std::string &name, clover::Buffer2D &buffer) { - out << name << "(" << 2 << ") [" << buffer.sizeX << "x" << buffer.sizeY << "]" + out << name << "(" << 2 << ") [" << buffer.nX() << "x" << buffer.nY() << "]" << std::endl; - for (size_t i = 0; i < buffer.sizeX; ++i) { + for (size_t i = 0; i < buffer.nX(); ++i) { out << "\t"; - for (size_t j = 0; j < buffer.sizeY; ++j) { + for (size_t j = 0; j < buffer.nY(); ++j) { out << buffer(i, j) << ", "; } out << std::endl; @@ -180,17 +289,24 @@ struct profiler_type { struct field_type { - clover::Buffer2D density0; - clover::Buffer2D density1; - clover::Buffer2D energy0; - clover::Buffer2D energy1; + clover::Buffer2D density0, density1; + clover::Buffer2D energy0, energy1; clover::Buffer2D pressure; clover::Buffer2D viscosity; + clover::Buffer2D volume; clover::Buffer2D soundspeed; + + + int density0_stride, density1_stride; + int energy0_stride, energy1_stride; + int pressure_stride; + int viscosity_stride; + int volume_stride; + int soundspeed_stride; + + clover::Buffer2D xvel0, xvel1; clover::Buffer2D yvel0, yvel1; - clover::Buffer2D vol_flux_x, mass_flux_x; - clover::Buffer2D vol_flux_y, mass_flux_y; clover::Buffer2D work_array1; // node_flux, stepbymass, volume_change, pre_vol clover::Buffer2D work_array2; // node_mass_post, post_vol @@ -200,36 +316,33 @@ struct field_type { clover::Buffer2D work_array6; // pre_vol, post_ener clover::Buffer2D work_array7; // post_vol, ener_flux - clover::Buffer1D cellx; - clover::Buffer1D celldx; - clover::Buffer1D celly; - clover::Buffer1D celldy; - 
clover::Buffer1D vertexx; - clover::Buffer1D vertexdx; - clover::Buffer1D vertexy; - clover::Buffer1D vertexdy; + clover::Buffer2D vol_flux_x, mass_flux_x; + clover::Buffer2D vol_flux_y, mass_flux_y; + clover::Buffer2D xarea, yarea; + + clover::Buffer1D cellx, celldx; + clover::Buffer1D celly, celldy; + + clover::Buffer1D vertexx, vertexdx; + clover::Buffer1D vertexy, vertexdy; + + + int base_stride; + int vels_wk_stride; + int flux_x_stride, flux_y_stride; - clover::Buffer2D volume; - clover::Buffer2D xarea; - clover::Buffer2D yarea; + explicit field_type(const int xrange, const int yrange) : - explicit field_type(const size_t xrange, const size_t yrange) : - density0(xrange, yrange), - density1(xrange, yrange), - energy0(xrange, yrange), - energy1(xrange, yrange), + density0(xrange, yrange), density1(xrange, yrange), + energy0(xrange, yrange), energy1(xrange, yrange), pressure(xrange, yrange), viscosity(xrange, yrange), + volume(xrange, yrange), soundspeed(xrange, yrange), - xvel0(xrange + 1, yrange + 1), - xvel1(xrange + 1, yrange + 1), - yvel0(xrange + 1, yrange + 1), - yvel1(xrange + 1, yrange + 1), - vol_flux_x(xrange + 1, yrange), - mass_flux_x(xrange + 1, yrange), - vol_flux_y(xrange, yrange + 1), - mass_flux_y(xrange, yrange + 1), + + xvel0(xrange + 1, yrange + 1), xvel1(xrange + 1, yrange + 1), + yvel0(xrange + 1, yrange + 1), yvel1(xrange + 1, yrange + 1), work_array1(xrange + 1, yrange + 1), work_array2(xrange + 1, yrange + 1), work_array3(xrange + 1, yrange + 1), @@ -237,19 +350,22 @@ struct field_type { work_array5(xrange + 1, yrange + 1), work_array6(xrange + 1, yrange + 1), work_array7(xrange + 1, yrange + 1), - cellx(xrange), - celldx(xrange), - celly(yrange), - celldy(yrange), + + vol_flux_x(xrange + 1, yrange), mass_flux_x(xrange + 1, yrange), + vol_flux_y(xrange, yrange + 1), mass_flux_y(xrange, yrange + 1), + xarea(xrange + 1, yrange), yarea(xrange, yrange + 1), + + cellx(xrange), celldx(xrange), + celly(yrange), celldy(yrange), + 
vertexx(xrange + 1), vertexdx(xrange + 1), vertexy(yrange + 1), vertexdy(yrange + 1), - volume(xrange, yrange), - xarea(xrange + 1, yrange), - yarea(xrange, yrange + 1) {} - + base_stride(xrange), + vels_wk_stride(xrange + 1), + flux_x_stride(xrange + 1), flux_y_stride(xrange) {} }; @@ -368,6 +484,8 @@ struct global_variables { const global_config config; const size_t omp_device; + bool use_target; + chunk_type chunk; int error_condition; @@ -392,15 +510,175 @@ struct global_variables { explicit global_variables( const global_config &config, size_t omp_device, + bool use_target, chunk_type chunk) : - config(config), omp_device(omp_device), chunk(std::move(chunk)), + config(config), omp_device(omp_device), use_target(use_target), chunk(std::move(chunk)), dt(config.dtinit), dtold(config.dtinit), profiler_on(config.profiler_on) {} + void hostToDevice() { + + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { + tile_type &t = chunk.tiles[tile]; + field_type &field = t.field; + + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; 
+ double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target update \ + to(density0[:field.density0.N()]) \ + to(density1[:field.density1.N()]) \ + to(energy0[:field.energy0.N()]) \ + to(energy1[:field.energy1.N()]) \ + to(pressure[:field.pressure.N()]) \ + to(viscosity[:field.viscosity.N()]) \ + to(soundspeed[:field.soundspeed.N()]) \ + to(yvel0[:field.yvel0.N()]) \ + to(yvel1[:field.yvel1.N()]) \ + to(xvel0[:field.xvel0.N()]) \ + to(xvel1[:field.xvel1.N()]) \ + to(vol_flux_x[:field.vol_flux_x.N()]) \ + to(vol_flux_y[:field.vol_flux_y.N()]) \ + to(mass_flux_x[:field.mass_flux_x.N()]) \ + to(mass_flux_y[:field.mass_flux_y.N()]) \ + to(work_array1[:field.work_array1.N()]) \ + to(work_array2[:field.work_array2.N()]) \ + to(work_array3[:field.work_array3.N()]) \ + to(work_array4[:field.work_array4.N()]) \ + to(work_array5[:field.work_array5.N()]) \ + to(work_array6[:field.work_array6.N()]) \ + to(work_array7[:field.work_array7.N()]) \ + to(cellx[:field.cellx.N()]) \ + to(celldx[:field.celldx.N()]) \ + to(celly[:field.celly.N()]) \ + to(celldy[:field.celldy.N()]) \ + to(vertexx[:field.vertexx.N()]) \ + to(vertexdx[:field.vertexdx.N()]) \ + to(vertexy[:field.vertexy.N()]) \ + to(vertexdy[:field.vertexdy.N()]) \ + to(volume[:field.volume.N()]) \ + to(xarea[:field.xarea.N()]) \ + to(yarea[:field.yarea.N()]) + } + + } + + void deviceToHost() { + + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { + tile_type &t = chunk.tiles[tile]; + field_type &field = t.field; + + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = 
field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target update \ + from(density0[:field.density0.N()]) \ + from(density1[:field.density1.N()]) \ + from(energy0[:field.energy0.N()]) \ + from(energy1[:field.energy1.N()]) \ + from(pressure[:field.pressure.N()]) \ + from(viscosity[:field.viscosity.N()]) \ + from(soundspeed[:field.soundspeed.N()]) \ + from(yvel0[:field.yvel0.N()]) \ + from(yvel1[:field.yvel1.N()]) \ + from(xvel0[:field.xvel0.N()]) \ + from(xvel1[:field.xvel1.N()]) \ + from(vol_flux_x[:field.vol_flux_x.N()]) \ + from(vol_flux_y[:field.vol_flux_y.N()]) \ + from(mass_flux_x[:field.mass_flux_x.N()]) \ + from(mass_flux_y[:field.mass_flux_y.N()]) \ + from(work_array1[:field.work_array1.N()]) \ + from(work_array2[:field.work_array2.N()]) \ + 
from(work_array3[:field.work_array3.N()]) \ + from(work_array4[:field.work_array4.N()]) \ + from(work_array5[:field.work_array5.N()]) \ + from(work_array6[:field.work_array6.N()]) \ + from(work_array7[:field.work_array7.N()]) \ + from(cellx[:field.cellx.N()]) \ + from(celldx[:field.celldx.N()]) \ + from(celly[:field.celly.N()]) \ + from(celldy[:field.celldy.N()]) \ + from(vertexx[:field.vertexx.N()]) \ + from(vertexdx[:field.vertexdx.N()]) \ + from(vertexy[:field.vertexy.N()]) \ + from(vertexdy[:field.vertexdy.N()]) \ + from(volume[:field.volume.N()]) \ + from(xarea[:field.xarea.N()]) \ + from(yarea[:field.yarea.N()]) + } + } + // dumps all content to file; for debugging only void dump(const std::string &filename) { + deviceToHost(); + std::cout << "Dumping globals to " << filename << std::endl; record(filename, [&](std::ostream &out) { @@ -500,6 +778,7 @@ struct global_variables { } + }; diff --git a/src/field_summary.cpp b/src/field_summary.cpp index 0e9c016..3214804 100644 --- a/src/field_summary.cpp +++ b/src/field_summary.cpp @@ -22,7 +22,7 @@ #include "field_summary.h" #include "timer.h" #include "ideal_gas.h" -#include "utils.hpp" + #include @@ -75,6 +75,9 @@ void field_summary(global_variables &globals, parallel_ ¶llel) { double ke = 0.0; double press = 0.0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; @@ -85,28 +88,50 @@ void field_summary(global_variables &globals, parallel_ ¶llel) { int xmin = t.info.t_xmin; field_type &field = t.field; - _Pragma("kernel1d") - for (int idx = (0); idx < ((ymax - ymin + 1) * (xmax - xmin + 1)); idx++) { + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *energy0 = field.energy0.data; + double *pressure = field.pressure.data; + double *xvel0 = field.xvel0.data; + double 
*yvel0 = field.yvel0.data; + + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) \ + map(tofrom:vol) \ + map(tofrom:mass) \ + map(tofrom:ie) \ + map(tofrom:ke) \ + map(tofrom:press) \ + reduction(+:vol, mass, ie, ke, press) + for (int idx = 0; idx < ((ymax - ymin + 1) * (xmax - xmin + 1)); idx++) { const int j = xmin + 1 + idx % (xmax - xmin + 1); const int k = ymin + 1 + idx / (xmax - xmin + 1); double vsqrd = 0.0; for (int kv = k; kv <= k + 1; ++kv) { for (int jv = j; jv <= j + 1; ++jv) { - vsqrd += 0.25 * (field.xvel0(jv, kv) * field.xvel0(jv, kv) + field.yvel0(jv, kv) * field.yvel0(jv, kv)); + vsqrd += 0.25 * (xvel0[(jv) + (kv) * vels_wk_stride] * xvel0[(jv) + (kv) * vels_wk_stride] + yvel0[(jv) + (kv) * vels_wk_stride] * yvel0[(jv) + (kv) * vels_wk_stride]); } } - double cell_vol = field.volume(j, k); - double cell_mass = cell_vol * field.density0(j, k); + double cell_vol = volume[j + (k) * base_stride]; + double cell_mass = cell_vol * density0[j + (k) * base_stride]; vol += cell_vol; mass += cell_mass; - ie += cell_mass * field.energy0(j, k); + ie += cell_mass * energy0[j + (k) * base_stride]; ke += cell_mass * 0.5 * vsqrd; - press += cell_vol * field.pressure(j, k); + press += cell_vol * pressure[j + (k) * base_stride]; } } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + clover_sum(vol); clover_sum(mass); clover_sum(ie); diff --git a/src/finalise_field.cpp b/src/finalise_field.cpp new file mode 100644 index 0000000..3ce2150 --- /dev/null +++ b/src/finalise_field.cpp @@ -0,0 +1,110 @@ +/* + Crown Copyright 2012 AWE. + + This file is part of CloverLeaf. + + CloverLeaf is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your option) + any later version. 
+ + CloverLeaf is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + CloverLeaf. If not, see http://www.gnu.org/licenses/. + */ + + +// @brief Allocates the data for each mesh chunk +// @author Wayne Gaudin +// @details The data fields for the mesh chunk are allocated based on the mesh +// size. + + +#include "finalise_field.h" + + +// Allocate Kokkos Views for the data arrays +void finalise_field(global_variables &globals) { + + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { + + tile_type &t = globals.chunk.tiles[tile]; + field_type &field = t.field; + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double 
*vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target exit data \ + map(release: density0[:0]) \ + map(release: density1[:0]) \ + map(release: energy0[:0]) \ + map(release: energy1[:0]) \ + map(release: pressure[:0]) \ + map(release: viscosity[:0]) \ + map(release: soundspeed[:0]) \ + map(release: yvel0[:0]) \ + map(release: yvel1[:0]) \ + map(release: xvel0[:0]) \ + map(release: xvel1[:0]) \ + map(release: vol_flux_x[:0]) \ + map(release: vol_flux_y[:0]) \ + map(release: mass_flux_x[:0]) \ + map(release: mass_flux_y[:0]) \ + map(release: work_array1[:0]) \ + map(release: work_array2[:0]) \ + map(release: work_array3[:0]) \ + map(release: work_array4[:0]) \ + map(release: work_array5[:0]) \ + map(release: work_array6[:0]) \ + map(release: work_array7[:0]) \ + map(release: cellx[:0]) \ + map(release: celldx[:0]) \ + map(release: celly[:0]) \ + map(release: celldy[:0]) \ + map(release: vertexx[:0]) \ + map(release: vertexdx[:0]) \ + map(release: vertexy[:0]) \ + map(release: vertexdy[:0]) \ + map(release: volume[:0]) \ + map(release: xarea[:0]) \ + map(release: yarea[:0]) + + } + +} + diff --git a/src/cxx14_compat.hpp b/src/finalise_field.h similarity index 58% rename from src/cxx14_compat.hpp rename to src/finalise_field.h index 7181e0f..c558a3e 100644 --- a/src/cxx14_compat.hpp +++ b/src/finalise_field.h @@ -3,14 +3,14 @@ This file is part of CloverLeaf. 
- CloverLeaf is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the - Free Software Foundation, either version 3 of the License, or (at your option) + CloverLeaf is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your option) any later version. - CloverLeaf is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + CloverLeaf is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with @@ -18,17 +18,12 @@ */ -#ifndef CXX14_COMPAT_HPP -#define CXX14_COMPAT_HPP +#ifndef FINALISE_FIELD_H +#define FINALISE_FIELD_H -#include +#include "definitions.h" +void finalise_field(global_variables &globals); -// taken from https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique -// one of the possible reference implementations -template -std::unique_ptr make_unique(Args &&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} +#endif -#endif //CXX14_COMPAT_HPP diff --git a/src/flux_calc.cpp b/src/flux_calc.cpp index 2095a10..694283c 100644 --- a/src/flux_calc.cpp +++ b/src/flux_calc.cpp @@ -21,33 +21,45 @@ #include "flux_calc.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran flux kernel. // @author Wayne Gaudin // @details The edge volume fluxes are calculated based on the velocity fields. 
void flux_calc_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y) { + field_type &field) { // DO k=y_min,y_max+1 // DO j=x_min,x_max+1 -// Note that the loops calculate one extra flux than required, but this + // Note that the loops calculate one extra flux than required, but this // allows loop fusion that improves performance - _Pragma("kernel2d") + + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel1 = field.yvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vol_flux_x(i, j) = 0.25 * dt * xarea(i, j) * (xvel0(i, j) + xvel0(i + 0, j + 1) + xvel1(i, j) + xvel1(i + 0, j + 1)); - vol_flux_y(i, j) = 0.25 * dt * yarea(i, j) * (yvel0(i, j) + yvel0(i + 1, j + 0) + yvel1(i, j) + yvel1(i + 1, j + 0)); + vol_flux_x[i + j * flux_x_stride] = 0.25 * dt * xarea[i + j * flux_x_stride] * + (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + xvel1[i + j * vels_wk_stride] + + xvel1[(i + 0) + (j + 1) * vels_wk_stride]); + vol_flux_y[i + j * flux_y_stride] = 0.25 * dt * yarea[i + j * flux_y_stride] * + (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + yvel1[i + j * vels_wk_stride] + + yvel1[(i + 1) + (j + 0) * vels_wk_stride]); } } } 
@@ -61,25 +73,27 @@ void flux_calc(global_variables &globals) { if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; flux_calc_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.xvel0, - t.field.yvel0, - t.field.xvel1, - t.field.yvel1, - t.field.vol_flux_x, - t.field.vol_flux_y); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.flux += timer() - kernel_time; } diff --git a/src/generate_chunk.cpp b/src/generate_chunk.cpp index db57d71..7db9169 100644 --- a/src/generate_chunk.cpp +++ b/src/generate_chunk.cpp @@ -27,36 +27,35 @@ #include #include "generate_chunk.h" -#include "utils.hpp" void generate_chunk(const int tile, global_variables &globals) { // Need to copy the host array of state input data into a device array - Buffer1D state_density(globals.config.number_of_states); - Buffer1D state_energy(globals.config.number_of_states); - Buffer1D state_xvel(globals.config.number_of_states); - Buffer1D state_yvel(globals.config.number_of_states); - Buffer1D state_xmin(globals.config.number_of_states); - Buffer1D state_xmax(globals.config.number_of_states); - Buffer1D state_ymin(globals.config.number_of_states); - Buffer1D state_ymax(globals.config.number_of_states); - Buffer1D state_radius(globals.config.number_of_states); - Buffer1D state_geometry(globals.config.number_of_states); + clover::Buffer1D state_density_buffer(globals.config.number_of_states); + clover::Buffer1D state_energy_buffer(globals.config.number_of_states); + clover::Buffer1D state_xvel_buffer(globals.config.number_of_states); + clover::Buffer1D state_yvel_buffer(globals.config.number_of_states); + clover::Buffer1D state_xmin_buffer(globals.config.number_of_states); + clover::Buffer1D 
state_xmax_buffer(globals.config.number_of_states); + clover::Buffer1D state_ymin_buffer(globals.config.number_of_states); + clover::Buffer1D state_ymax_buffer(globals.config.number_of_states); + clover::Buffer1D state_radius_buffer(globals.config.number_of_states); + clover::Buffer1D state_geometry_buffer(globals.config.number_of_states); // Copy the data to the new views for (int state = 0; state < globals.config.number_of_states; ++state) { - state_density[state] = globals.config.states[state].density; - state_energy[state] = globals.config.states[state].energy; - state_xvel[state] = globals.config.states[state].xvel; - state_yvel[state] = globals.config.states[state].yvel; - state_xmin[state] = globals.config.states[state].xmin; - state_xmax[state] = globals.config.states[state].xmax; - state_ymin[state] = globals.config.states[state].ymin; - state_ymax[state] = globals.config.states[state].ymax; - state_radius[state] = globals.config.states[state].radius; - state_geometry[state] = globals.config.states[state].geometry; + state_density_buffer[state] = globals.config.states[state].density; + state_energy_buffer[state] = globals.config.states[state].energy; + state_xvel_buffer[state] = globals.config.states[state].xvel; + state_yvel_buffer[state] = globals.config.states[state].yvel; + state_xmin_buffer[state] = globals.config.states[state].xmin; + state_xmax_buffer[state] = globals.config.states[state].xmax; + state_ymin_buffer[state] = globals.config.states[state].ymin; + state_ymax_buffer[state] = globals.config.states[state].ymax; + state_radius_buffer[state] = globals.config.states[state].radius; + state_geometry_buffer[state] = globals.config.states[state].geometry; } // Kokkos::deep_copy (TO, FROM) @@ -76,58 +75,99 @@ void generate_chunk(const int tile, global_variables &globals) { field_type &field = globals.chunk.tiles[tile].field; + const double state_energy_0 = state_energy_buffer[0]; + const double state_density_0 = state_density_buffer[0]; + const 
double state_xvel_0 = state_xvel_buffer[0]; + const double state_yvel_0 = state_yvel_buffer[0]; + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + // State 1 is always the background state - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.energy0(i, j) = state_energy[0]; - field.density0(i, j) = state_density[0]; - field.xvel0(i, j) = state_xvel[0]; - field.yvel0(i, j) = state_yvel[0]; + double *energy0 = field.energy0.data; + double *density0 = field.density0.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { + energy0[i + j * base_stride] = state_energy_0; + density0[i + j * base_stride] = state_density_0; + xvel0[i + j * vels_wk_stride] = state_xvel_0; + yvel0[i + j * vels_wk_stride] = state_yvel_0; } } for (int state = 1; state < globals.config.number_of_states; ++state) { - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { + + double *cellx = field.cellx.data; + double *celly = field.celly.data; + + double *vertexx = field.vertexx.data; + double *vertexy = field.vertexy.data; + + const double *state_density = state_density_buffer.data; + const double *state_energy = state_energy_buffer.data; + const double *state_xvel = state_xvel_buffer.data; + const double *state_yvel = state_yvel_buffer.data; + const double *state_xmin = state_xmin_buffer.data; + const double *state_xmax = state_xmax_buffer.data; + const double *state_ymin = state_ymin_buffer.data; + const double *state_ymax = state_ymax_buffer.data; + const double *state_radius = state_radius_buffer.data; + const int *state_geometry = state_geometry_buffer.data; + + #pragma omp target teams distribute parallel for simd collapse(2) 
clover_use_target(globals.use_target) \ + map(to : state_density[:state_density_buffer.N()]) \ + map(to : state_energy[:state_energy_buffer.N()]) \ + map(to : state_xvel[:state_xvel_buffer.N()]) \ + map(to : state_yvel[:state_yvel_buffer.N()]) \ + map(to : state_xmin[:state_xmin_buffer.N()]) \ + map(to : state_xmax[:state_xmax_buffer.N()]) \ + map(to : state_ymin[:state_ymin_buffer.N()]) \ + map(to : state_ymax[:state_ymax_buffer.N()]) \ + map(to : state_radius[:state_radius_buffer.N()]) \ + map(to : state_geometry[:state_geometry_buffer.N()]) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { double x_cent = state_xmin[state]; double y_cent = state_ymin[state]; if (state_geometry[state] == g_rect) { - if (field.vertexx[i + 1] >= state_xmin[state] && field.vertexx[i] < state_xmax[state]) { - if (field.vertexy[j + 1] >= state_ymin[state] && field.vertexy[j] < state_ymax[state]) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + if (vertexx[i + 1] >= state_xmin[state] && vertexx[i] < state_xmax[state]) { + if (vertexy[j + 1] >= state_ymin[state] && vertexy[j] < state_ymax[state]) { + energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } } } else if (state_geometry[state] == g_circ) { - double radius = std::sqrt((field.cellx[i] - x_cent) * - (field.cellx[i] - x_cent) + (field.celly[j] - y_cent) * (field.celly[j] - y_cent)); + double radius = sqrt((cellx[i] - x_cent) * + (cellx[i] - x_cent) + (celly[j] - y_cent) * (celly[j] - y_cent)); if (radius <= state_radius[state]) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + 
energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } } else if (state_geometry[state] == g_point) { - if (field.vertexx[i] == x_cent && field.vertexy[j] == y_cent) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + if (vertexx[i] == x_cent && vertexy[j] == y_cent) { + energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } diff --git a/src/hydro.cpp b/src/hydro.cpp index 5d53597..060c108 100644 --- a/src/hydro.cpp +++ b/src/hydro.cpp @@ -18,18 +18,18 @@ */ +#include "accelerate.h" #include "hydro.h" #include "timer.h" #include "field_summary.h" #include "visit.h" #include "timestep.h" #include "PdV.h" -#include "accelerate.h" #include "flux_calc.h" #include "advection.h" #include "reset_field.h" +#include "finalise_field.h" -#include extern std::ostream g_out; @@ -220,6 +220,7 @@ void hydro(global_variables &globals, parallel_ ¶llel) { } } + //clover_finalize(); Skipped as just closes the file and calls MPI_Finalize (which is done back in main). 
break; @@ -229,16 +230,16 @@ void hydro(global_variables &globals, parallel_ ¶llel) { if (parallel.boss) { wall_clock = timer() - timerstart; double step_clock = timer() - step_time; - g_out << "Wall clock " << wall_clock << std::endl; - std::cout << "Wall clock " << wall_clock << std::endl; + g_out << "Wall clock " << wall_clock << "\n"; + std::cout << "Wall clock " << wall_clock << "\n"; double cells = globals.config.grid.x_cells * globals.config.grid.y_cells; double rstep = globals.step; double grind_time = wall_clock / (rstep * cells); double step_grind = step_clock / cells; - std::cout << "Average time per cell " << grind_time << std::endl; - g_out << "Average time per cell " << grind_time << std::endl; - std::cout << "Step time per cell " << step_grind << std::endl; - g_out << "Step time per cell " << step_grind << std::endl; + std::cout << "Average time per cell " << grind_time << "\n"; + g_out << "Average time per cell " << grind_time << "\n"; + std::cout << "Step time per cell " << step_grind << "\n"; + g_out << "Step time per cell " << step_grind << "\n"; } } diff --git a/src/ideal_gas.cpp b/src/ideal_gas.cpp index 2b55231..42a389e 100644 --- a/src/ideal_gas.cpp +++ b/src/ideal_gas.cpp @@ -21,8 +21,8 @@ #include #include "ideal_gas.h" -#include "utils.hpp" +#include "comms.h" #define IDX(buffer, x, y) buffer[idx[(x)]][idx[(y)]] @@ -35,11 +35,11 @@ int N = 0; // @details Calculates the pressure and sound speed for the mesh chunk using // the ideal gas equation of state, with a fixed gamma of 1.4. 
void ideal_gas_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density, - clover::Buffer2D &energy, - clover::Buffer2D &pressure, - clover::Buffer2D &soundspeed) { + field_type &field, + clover::Buffer2D &density_buffer, + clover::Buffer2D &energy_buffer) { //std::cout <<" ideal_gas(" << x_min+1 << ","<< y_min+1<< ","<< x_max+2<< ","<< y_max +2 << ")" << std::endl; // DO k=y_min,y_max @@ -47,15 +47,22 @@ void ideal_gas_kernel( // Kokkos::MDRangePolicy > policy({x_min + 1, y_min + 1}, {x_max + 2, y_max + 2}); - _Pragma("kernel2d") + const int base_stride = field.base_stride; + + double *density = density_buffer.data; + double *energy = energy_buffer.data; + double *pressure = field.pressure.data; + double *soundspeed = field.soundspeed.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double v = 1.0 / density(i, j); - pressure(i, j) = (1.4 - 1.0) * density(i, j) * energy(i, j); - double pressurebyenergy = (1.4 - 1.0) * density(i, j); - double pressurebyvolume = -density(i, j) * pressure(i, j); - double sound_speed_squared = v * v * (pressure(i, j) * pressurebyenergy - pressurebyvolume); - soundspeed(i, j) = std::sqrt(sound_speed_squared); + double v = 1.0 / density[i + j * base_stride]; + pressure[i + j * base_stride] = (1.4 - 1.0) * density[i + j * base_stride] * energy[i + j * base_stride]; + double pressurebyenergy = (1.4 - 1.0) * density[i + j * base_stride]; + double pressurebyvolume = -density[i + j * base_stride] * pressure[i + j * base_stride]; + double sound_speed_squared = v * v * (pressure[i + j * base_stride] * pressurebyenergy - pressurebyvolume); + soundspeed[i + j * base_stride] = std::sqrt(sound_speed_squared); } }; @@ -71,30 +78,37 @@ void ideal_gas(global_variables &globals, const int tile, bool predict) { tile_type &t = globals.chunk.tiles[tile]; + 
#if SYNC_BUFFERS + globals.hostToDevice(); + #endif if (!predict) { ideal_gas_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, + t.field, t.field.density0, - t.field.energy0, - t.field.pressure, - t.field.soundspeed + t.field.energy0 ); } else { ideal_gas_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, + t.field, t.field.density1, - t.field.energy1, - t.field.pressure, - t.field.soundspeed + t.field.energy1 ); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/initialise.cpp b/src/initialise.cpp index 51a7349..bf3c3cb 100644 --- a/src/initialise.cpp +++ b/src/initialise.cpp @@ -41,7 +41,7 @@ std::ofstream of; struct RunConfig { std::string file; - size_t deviceIdx; + int deviceIdx; }; @@ -66,7 +66,9 @@ void printHelp(const std::string &name) { RunConfig parseArgs(const size_t num_devices, const std::vector &args) { - const auto readParam = [&args](size_t current, const std::string &emptyMessage, auto map) { + const auto readParam = [&args](size_t current, + const std::string &emptyMessage, + const std::function &map) { if (current + 1 < args.size()) { return map(args[current + 1]); } else { @@ -84,21 +86,22 @@ RunConfig parseArgs(const size_t num_devices, printHelp(args[0]); std::exit(EXIT_SUCCESS); } else if (arg == "--list") { + std::cout << "OMP devices:" << std::endl; printSimple(num_devices); std::exit(EXIT_SUCCESS); } else if (arg == "--no-target") { config.deviceIdx = -1; } else if (arg == "--device") { - readParam(i, "--device specified but no index was given", [&](const auto ¶m) { + readParam(i, "--device specified but no index was given", [&](const std::string ¶m) { auto selected = std::stoul(param); - if (selected < 0 || selected >= num_devices) { + if (selected >= num_devices) { std::cerr << "bad device index `" << param << "`" << std::endl; std::exit(EXIT_FAILURE); } config.deviceIdx = selected; }); } else if (arg == "--file") { - readParam(i, 
"--file specified but no file was given", [&config](const auto ¶m) { + readParam(i, "--file specified but no file was given", [&config](const std::string ¶m) { config.file = param; }); } @@ -106,8 +109,7 @@ RunConfig parseArgs(const size_t num_devices, return config; } -std::unique_ptr -initialise(parallel_ ¶llel, const std::vector &args) { +global_variables initialise(parallel_ ¶llel, const std::vector &args) { global_config config; @@ -133,23 +135,43 @@ initialise(parallel_ ¶llel, const std::vector &args) { clover_barrier(); -// -// int x = 1; -// #pragma omp target map(tofrom: x) -// x = x + 1; auto num_devices = omp_get_num_devices(); - if (num_devices == 0) { - std::cout << "No OMP target devices available" << std::endl; - } else { - std::cout << "Detected OMP devices:" << std::endl; - printSimple(num_devices); + if (parallel.boss) { + + if (num_devices == 0) { + std::cout << "No OMP target devices available" << std::endl; + } else { + std::cout << "Detected OMP devices:" << std::endl; + printSimple(num_devices); + } + std::cout << "\n" << std::endl; } auto runConfig = parseArgs(num_devices, args); auto file = runConfig.file; auto selectedDevice = runConfig.deviceIdx; - std::cout << "Using OMP device: " << selectedDevice << std::endl; + auto useTarget = selectedDevice != -1; + + if (parallel.boss) { + (!useTarget ? 
+ std::cout << "Using OMP device: (host fallback))" : + std::cout << "Using OMP device: #" << selectedDevice) << std::endl; + } + + if (!useTarget) { + std::cout << "Using OMP device: (host fallback))" << std::endl; + + #ifndef OMP_ALLOW_HOST + std::cerr << "Error: host fallback mode selected but OMP_ALLOW_HOST not enabled at compile time" << std::endl; + std::exit(EXIT_FAILURE); + #endif + + + } else { + omp_set_default_device(selectedDevice); + } + std::ifstream g_in; if (parallel.boss) { @@ -207,9 +229,9 @@ initialise(parallel_ ¶llel, const std::vector &args) { config.number_of_chunks = parallel.max_task; - auto globals = start(parallel, config, selectedDevice); + auto globals = start(parallel, config, selectedDevice, useTarget); - clover_barrier(*globals); + clover_barrier(globals); if (parallel.boss) { g_out << "Starting the calculation" << std::endl; diff --git a/src/initialise.h b/src/initialise.h index 2acc50a..4bdc26d 100644 --- a/src/initialise.h +++ b/src/initialise.h @@ -24,7 +24,7 @@ #include "comms.h" #include "definitions.h" -std::unique_ptr initialise(parallel_ ¶llel, const std::vector &args); +global_variables initialise(parallel_ ¶llel, const std::vector &args); #endif diff --git a/src/initialise_chunk.cpp b/src/initialise_chunk.cpp index ed5c694..c7b93fe 100644 --- a/src/initialise_chunk.cpp +++ b/src/initialise_chunk.cpp @@ -27,7 +27,7 @@ #include "initialise_chunk.h" -#include "utils.hpp" + void initialise_chunk(const int tile, global_variables &globals) { @@ -56,43 +56,60 @@ void initialise_chunk(const int tile, global_variables &globals) { field_type &field = globals.chunk.tiles[tile].field; - _Pragma("kernel1d") - for (int j = (0); j < (xrange); j++) { - field.vertexx[j] = xmin + dx * (j - 1 - x_min); - field.vertexdx[j] = dx; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int j = 0; j < (xrange); j++) { + 
vertexx[j] = xmin + dx * (j - 1 - x_min); + vertexdx[j] = dx; } - _Pragma("kernel1d") - for (int k = (0); k < (yrange); k++) { - field.vertexy[k] = ymin + dy * (k - 1 - y_min); - field.vertexdy[k] = dy; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int k = 0; k < (yrange); k++) { + vertexy[k] = ymin + dy * (k - 1 - y_min); + vertexdy[k] = dy; } const int xrange1 = (x_max + 2) - (x_min - 2) + 1; const int yrange1 = (y_max + 2) - (y_min - 2) + 1; - _Pragma("kernel1d") - for (int j = (0); j < (xrange1); j++) { - field.cellx[j] = 0.5 * (field.vertexx[j] + field.vertexx[j + 1]); - field.celldx[j] = dx; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int j = 0; j < (xrange1); j++) { + cellx[j] = 0.5 * (vertexx[j] + vertexx[j + 1]); + celldx[j] = dx; } - _Pragma("kernel1d") - for (int k = (0); k < (yrange1); k++) { - field.celly[k] = 0.5 * (field.vertexy[k] + field.vertexy[k + 1]); - field.celldy[k] = dy; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int k = 0; k < (yrange1); k++) { + celly[k] = 0.5 * (vertexy[k] + vertexy[k + 1]); + celldy[k] = dy; } + const int base_stride = field.base_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; - _Pragma("kernel2d") - for (int j = (0); j < (yrange1); j++) { - for (int i = (0); i < (xrange1); i++) { - field.volume(i, j) = dx * dy; - field.xarea(i, j) = field.celldy[j]; - field.yarea(i, j) = field.celldx[i]; + #pragma omp target teams distribute parallel for simd collapse(2) 
clover_use_target(globals.use_target) + for (int j = 0; j < (yrange1); j++) { + for (int i = 0; i < (xrange1); i++) { + volume[i + j * base_stride] = dx * dy; + xarea[i + j * flux_x_stride] = celldy[j]; + yarea[i + j * flux_y_stride] = celldx[i]; } } diff --git a/src/pack_kernel.cpp b/src/pack_kernel.cpp index c55f1d7..3e70ec5 100644 --- a/src/pack_kernel.cpp +++ b/src/pack_kernel.cpp @@ -24,11 +24,11 @@ #include "pack_kernel.h" -#include "utils.hpp" -void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &left_snd, + +void clover_pack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &left_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -54,14 +54,17 @@ void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth + // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + double *left_snd = left_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - left_snd[index] = field(x_min + x_inc - 1 + j, k); + left_snd[index] = field[(x_min + x_inc - 1 + j) + (k) * field_sizex]; } } @@ -69,9 +72,9 @@ void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, } -void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &left_rcv, +void clover_unpack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &left_rcv_buffer, int cell_data, int 
vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -93,16 +96,19 @@ void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth + // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *left_rcv = left_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - field(x_min - j, k) = left_rcv[index]; + field[(x_min - j) + (k) * field_sizex] = left_rcv[index]; } } @@ -110,9 +116,9 @@ void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, } -void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &right_snd, +void clover_pack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &right_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -134,12 +140,15 @@ void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + // DO k=y_min-depth,y_max+y_inc+depth + double *right_snd = right_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - right_snd[index] = field(x_min + 1 + j, k); + right_snd[index] = field[(x_min + 1 + j) + (k) * field_sizex]; 
} } @@ -147,9 +156,9 @@ void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, } -void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &right_rcv, +void clover_unpack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &right_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -175,21 +184,24 @@ void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + // DO k=y_min-depth,y_max+y_inc+depth + double *right_rcv = right_rcv_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - right_rcv[index] = field(x_max + x_inc + j, k); + right_rcv[index] = field[(x_max + x_inc + j) + (k) * field_sizex]; } } } -void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &top_snd, +void clover_pack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &top_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -214,17 +226,20 @@ void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *top_snd = top_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel 
for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - top_snd[index] = field(j, y_max + 1 - k); + top_snd[index] = field[j + (y_max + 1 - k) * field_sizex]; } } } -void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &top_rcv, +void clover_unpack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &top_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -254,18 +269,21 @@ void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *top_rcv = top_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - field(j, y_max + y_inc + k) = top_rcv[index]; + field[j + (y_max + y_inc + k) * field_sizex] = top_rcv[index]; } } } -void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &bottom_snd, +void clover_pack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &bottom_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -294,17 +312,20 @@ void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *bottom_snd = bottom_snd_buffer.data; + double *field = field_buffer.data; 
+ const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - bottom_snd[index] = field(j, y_min + y_inc - 1 + k); + bottom_snd[index] = field[j + (y_min + y_inc - 1 + k) * field_sizex]; } } } -void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &bottom_rcv, +void clover_unpack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &bottom_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -329,10 +350,13 @@ void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *bottom_rcv = bottom_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - field(j, y_min - k) = bottom_rcv[index]; + field[j + (y_min - k) * field_sizex] = bottom_rcv[index]; } } } diff --git a/src/pack_kernel.h b/src/pack_kernel.h index dfa6c38..32f27dc 100644 --- a/src/pack_kernel.h +++ b/src/pack_kernel.h @@ -22,44 +22,44 @@ #define PACK_KERNEL_H #include "definitions.h" -#include "utils.hpp" -void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, + +void clover_pack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &left_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, 
int buffer_offset); -void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &left_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &right_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &right_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &top_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &top_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &bottom_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, 
int field_type, int buffer_offset); -void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &bottom_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, diff --git a/src/reset_field.cpp b/src/reset_field.cpp index daa4c9d..62982d4 100644 --- a/src/reset_field.cpp +++ b/src/reset_field.cpp @@ -20,32 +20,32 @@ #include "reset_field.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran reset field kernel. // @author Wayne Gaudin // @details Copies all of the final end of step filed data to the begining of // step data, ready for the next timestep. void reset_field_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1, - clover::Buffer2D &xvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel0, - clover::Buffer2D &yvel1) { + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + const int base_stride = field.base_stride; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - density0(i, j) = density1(i, j); - energy0(i, j) = energy1(i, j); + density0[i + j * base_stride] = density1[i + j * base_stride]; + energy0[i + j * base_stride] = energy1[i + j * base_stride]; } } @@ -54,11 +54,17 @@ void reset_field_kernel( // DO k=y_min,y_max+1 // DO j=x_min,x_max+1 - _Pragma("kernel2d") + const int vels_wk_stride = field.vels_wk_stride; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + 
double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - xvel0(i, j) = xvel1(i, j); - yvel0(i, j) = yvel1(i, j); + xvel0[i + j * vels_wk_stride] = xvel1[i + j * vels_wk_stride]; + yvel0[i + j * vels_wk_stride] = yvel1[i + j * vels_wk_stride]; } } @@ -73,26 +79,26 @@ void reset_field(global_variables &globals) { double kernel_time = 0; if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; reset_field_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1, - t.field.xvel0, - t.field.xvel1, - t.field.yvel0, - t.field.yvel1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.reset += timer() - kernel_time; } diff --git a/src/revert.cpp b/src/revert.cpp index 681d0ad..036925a 100644 --- a/src/revert.cpp +++ b/src/revert.cpp @@ -19,7 +19,7 @@ #include "revert.h" -#include "utils.hpp" + // @brief Fortran revert kernel. // @author Wayne Gaudin @@ -27,19 +27,24 @@ // it to the start of step data, ready for the corrector. // Note that this does not seem necessary in this proxy-app but should be // left in to remain relevant to the full method. 
-void revert_kernel(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1) { +void revert_kernel( + bool use_target, + int x_min, int x_max, int y_min, int y_max, + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + const int base_stride = field.base_stride; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - density1(i, j) = density0(i, j); - energy1(i, j) = energy0(i, j); + density1[i + j * base_stride] = density0[i + j * base_stride]; + energy1[i + j * base_stride] = energy0[i + j * base_stride]; } } @@ -51,19 +56,25 @@ void revert_kernel(int x_min, int x_max, int y_min, int y_max, // @details Invokes the user specified revert kernel. 
void revert(global_variables &globals) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; revert_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + + } diff --git a/src/start.cpp b/src/start.cpp index 0cfff35..ad647be 100644 --- a/src/start.cpp +++ b/src/start.cpp @@ -34,16 +34,17 @@ #include "field_summary.h" #include "update_halo.h" #include "visit.h" -#include "cxx14_compat.hpp" +#include "flux_calc.h" #include #include extern std::ostream g_out; -std::unique_ptr start(parallel_ ¶llel, - const global_config &config, - size_t omp_device) { +global_variables start(parallel_ ¶llel, + const global_config &config, + size_t omp_device, + bool use_target) { if (parallel.boss) { g_out << "Setting up initial geometry" << std::endl @@ -67,6 +68,7 @@ std::unique_ptr start(parallel_ ¶llel, global_variables globals(config, omp_device, + use_target, chunk_type( chunkNeighbours, parallel.task, 1, 1, x_cells, y_cells, @@ -80,8 +82,15 @@ std::unique_ptr start(parallel_ ¶llel, auto infos = clover_tile_decompose(globals, x_cells, y_cells); - std::transform(infos.begin(), infos.end(), std::back_inserter(globals.chunk.tiles), - [](const tile_info &ti) { return tile_type(ti); }); + for (auto &ti : infos) { + globals.chunk.tiles.emplace_back(ti); + } + +// std::transform(infos.begin(), infos.end(), std::back_inserter(globals.chunk.tiles), +// [](const tile_info &ti) { return tile_type(ti); }); + + +// #pragma omp target enter data map(alloc: globals.chunk.tiles[0:N]) @@ -99,7 +108,10 @@ std::unique_ptr start(parallel_ ¶llel, for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { initialise_chunk(tile, globals); + if (DEBUG) std::cout << "Field initialised2" << std::endl; + 
generate_chunk(tile, globals); + if (DEBUG) std::cout << "Field initialised3" << std::endl; } @@ -111,9 +123,17 @@ std::unique_ptr start(parallel_ ¶llel, bool profiler_off = globals.profiler_on; globals.profiler_on = false; + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { ideal_gas(globals, tile, false); } + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif if (DEBUG) globals.dump("dump_0_after_ideal_gas.txt"); // Prime all halo data for the first step @@ -132,7 +152,11 @@ std::unique_ptr start(parallel_ ¶llel, fields[field_xvel1] = 1; fields[field_yvel1] = 1; + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif update_halo(globals, fields, 2); + if (DEBUG)globals.dump("dump_0_after_update_halo.txt"); @@ -149,6 +173,6 @@ std::unique_ptr start(parallel_ ¶llel, globals.profiler_on = profiler_off; - return make_unique(globals); + return globals; } diff --git a/src/start.h b/src/start.h index 146ffd4..b8ad4e3 100644 --- a/src/start.h +++ b/src/start.h @@ -25,9 +25,10 @@ #include "comms.h" #include "definitions.h" -std::unique_ptr start(parallel_ ¶llel, - const global_config &config, - size_t omp_device); +global_variables start(parallel_ ¶llel, + const global_config &config, + size_t omp_device, + bool use_target); #endif diff --git a/src/timestep.cpp b/src/timestep.cpp index 8efcbfa..10ee443 100644 --- a/src/timestep.cpp +++ b/src/timestep.cpp @@ -97,10 +97,10 @@ void timestep(global_variables &globals, parallel_ ¶llel) { if (parallel.boss) { g_out << " Step " << globals.step << " time " << globals.time << " control " << dt_control << " timestep " << globals.dt << " " << globals.jdt << "," << globals.kdt << " x " - << x_pos << " y " << y_pos << std::endl; + << x_pos << " y " << y_pos << "\n"; std::cout << " Step " << globals.step << " time " << globals.time << " control " << dt_control << " timestep " << globals.dt << " " << globals.jdt << "," - << globals.kdt << " x " << x_pos << " y " << y_pos 
<< std::endl; + << globals.kdt << " x " << x_pos << " y " << y_pos << "\n"; } if (small == 1) { diff --git a/src/update_halo.cpp b/src/update_halo.cpp index 478366d..1f84522 100644 --- a/src/update_halo.cpp +++ b/src/update_halo.cpp @@ -23,7 +23,7 @@ #include "update_halo.h" #include "update_tile_halo.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran kernel to update the external halo cells in a chunk. @@ -33,6 +33,7 @@ // of data governs how this is carried out. External boundaries are always // reflective. void update_halo_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, const std::array &chunk_neighbours, const std::array &tile_neighbours, @@ -41,6 +42,11 @@ void update_halo_kernel( int depth) { + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + // Update values in external halo cells based on depth and fields requested // Even though half of these loops look the wrong way around, it should be noted // that depth is either 1 or 2 so that it is more efficient to always thread @@ -51,10 +57,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density0(j, 1 - k) = field.density0(j, 2 + k); + density0[j + (1 - k) * base_stride] = density0[j + (2 + k) * base_stride]; } } @@ -64,10 +71,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density0(j, y_max + 2 + k) 
= field.density0(j, y_max + 1 - k); + density0[j + (y_max + 2 + k) * base_stride] = density0[j + (y_max + 1 - k) * base_stride]; } } @@ -77,10 +85,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density0(1 - j, k) = field.density0(2 + j, k); + density0[(1 - j) + (k) * base_stride] = density0[(2 + j) + (k) * base_stride]; } } @@ -90,10 +99,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density0(x_max + 2 + j, k) = field.density0(x_max + 1 - j, k); + density0[(x_max + 2 + j) + (k) * base_stride] = density0[(x_max + 1 - j) + (k) * base_stride]; } } @@ -107,10 +117,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density1(j, 1 - k) = field.density1(j, 2 + k); + density1[j + (1 - k) * base_stride] = density1[j + (2 + k) * base_stride]; } } @@ -120,10 +131,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density1(j, y_max + 2 + k) = field.density1(j, y_max + 1 - k); + density1[j + (y_max + 2 + k) * 
base_stride] = density1[j + (y_max + 1 - k) * base_stride]; } } @@ -133,10 +145,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density1(1 - j, k) = field.density1(2 + j, k); + density1[(1 - j) + (k) * base_stride] = density1[(2 + j) + (k) * base_stride]; } } @@ -146,10 +159,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density1(x_max + 2 + j, k) = field.density1(x_max + 1 - j, k); + density1[(x_max + 2 + j) + (k) * base_stride] = density1[(x_max + 1 - j) + (k) * base_stride]; } } @@ -161,10 +175,11 @@ void update_halo_kernel( (tile_neighbours[tile_bottom] == external_tile)) { // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy0(j, 1 - k) = field.energy0(j, 2 + k); + energy0[j + (1 - k) * base_stride] = energy0[j + (2 + k) * base_stride]; } } @@ -173,10 +188,11 @@ void update_halo_kernel( (tile_neighbours[tile_top] == external_tile)) { // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy0(j, y_max + 2 + k) = field.energy0(j, y_max + 1 - k); + energy0[j 
+ (y_max + 2 + k) * base_stride] = energy0[j + (y_max + 1 - k) * base_stride]; } } @@ -185,10 +201,11 @@ void update_halo_kernel( (tile_neighbours[tile_left] == external_tile)) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy0(1 - j, k) = field.energy0(2 + j, k); + energy0[(1 - j) + (k) * base_stride] = energy0[(2 + j) + (k) * base_stride]; } } @@ -197,10 +214,11 @@ void update_halo_kernel( (tile_neighbours[tile_right] == external_tile)) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy0(x_max + 2 + j, k) = field.energy0(x_max + 1 - j, k); + energy0[(x_max + 2 + j) + (k) * base_stride] = energy0[(x_max + 1 - j) + (k) * base_stride]; } } @@ -214,10 +232,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy1(j, 1 - k) = field.energy1(j, 2 + k); + energy1[j + (1 - k) * base_stride] = energy1[j + (2 + k) * base_stride]; } } @@ -227,10 +246,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy1(j, y_max + 2 + k) = field.energy1(j, y_max + 1 - k); + 
energy1[j + (y_max + 2 + k) * base_stride] = energy1[j + (y_max + 1 - k) * base_stride]; } } @@ -240,10 +260,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy1(1 - j, k) = field.energy1(2 + j, k); + energy1[(1 - j) + (k) * base_stride] = energy1[(2 + j) + (k) * base_stride]; } } @@ -253,10 +274,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy1(x_max + 2 + j, k) = field.energy1(x_max + 1 - j, k); + energy1[(x_max + 2 + j) + (k) * base_stride] = energy1[(x_max + 1 - j) + (k) * base_stride]; } } @@ -269,10 +291,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.pressure(j, 1 - k) = field.pressure(j, 2 + k); + pressure[j + (1 - k) * base_stride] = pressure[j + (2 + k) * base_stride]; } } @@ -282,10 +305,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.pressure(j, y_max + 2 + k) = field.pressure(j, y_max + 1 - k); + pressure[j + (y_max + 2 + k) * base_stride] = pressure[j + (y_max + 1 - k) * 
base_stride]; } } @@ -295,10 +319,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.pressure(1 - j, k) = field.pressure(2 + j, k); + pressure[(1 - j) + (k) * base_stride] = pressure[(2 + j) + (k) * base_stride]; } } @@ -308,10 +333,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.pressure(x_max + 2 + j, k) = field.pressure(x_max + 1 - j, k); + pressure[(x_max + 2 + j) + (k) * base_stride] = pressure[(x_max + 1 - j) + (k) * base_stride]; } } @@ -324,10 +350,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.viscosity(j, 1 - k) = field.viscosity(j, 2 + k); + viscosity[j + (1 - k) * base_stride] = viscosity[j + (2 + k) * base_stride]; } } @@ -337,10 +364,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.viscosity(j, y_max + 2 + k) = field.viscosity(j, y_max + 1 - k); + viscosity[j + (y_max + 2 + k) * base_stride] = viscosity[j + (y_max + 1 - k) * base_stride]; } } @@ -350,10 +378,11 @@ void 
update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.viscosity(1 - j, k) = field.viscosity(2 + j, k); + viscosity[(1 - j) + (k) * base_stride] = viscosity[(2 + j) + (k) * base_stride]; } } @@ -363,10 +392,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.viscosity(x_max + 2 + j, k) = field.viscosity(x_max + 1 - j, k); + viscosity[(x_max + 2 + j) + (k) * base_stride] = viscosity[(x_max + 1 - j) + (k) * base_stride]; } } @@ -379,10 +409,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.soundspeed(j, 1 - k) = field.soundspeed(j, +k); + soundspeed[j + (1 - k) * base_stride] = soundspeed[j + (+k) * base_stride]; } } @@ -392,10 +423,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.soundspeed(j, y_max + 2 + k) = field.soundspeed(j, y_max + 1 - k); + soundspeed[j + (y_max + 2 + k) * base_stride] = soundspeed[j + (y_max + 1 - k) * base_stride]; } } @@ -405,10 +437,11 @@ void update_halo_kernel( // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.soundspeed(1 - j, k) = field.soundspeed(2 + j, k); + soundspeed[(1 - j) + (k) * base_stride] = soundspeed[(2 + j) + (k) * base_stride]; } } @@ -418,10 +451,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.soundspeed(x_max + 2 + j, k) = field.soundspeed(x_max + 1 - j, k); + soundspeed[(x_max + 2 + j) + (k) * base_stride] = soundspeed[(x_max + 1 - j) + (k) * base_stride]; } } @@ -430,17 +464,18 @@ void update_halo_kernel( if (fields[field_xvel0] == 1) { + + if ((chunk_neighbours[chunk_bottom] == external_face) && (tile_neighbours[tile_bottom] == external_tile)) { // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel0(j, 1 - k) = field.xvel0(j, - 1 + 2 + - k); + xvel0[j + (1 - k) * vels_wk_stride] = xvel0[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -450,10 +485,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel0(j, y_max + 1 + 2 + k) = field.xvel0(j, y_max + 1 - k); + xvel0[j + (y_max + 1 + 2 + k) * 
vels_wk_stride] = xvel0[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -463,10 +499,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel0(1 - j, k) = -field.xvel0(1 + 2 + j, k); + xvel0[(1 - j) + (k) * vels_wk_stride] = -xvel0[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -476,10 +513,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel0(x_max + 2 + 1 + j, k) = -field.xvel0(x_max + 1 - j, k); + xvel0[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = -xvel0[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -492,10 +530,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel1(j, 1 - k) = field.xvel1(j, 1 + 2 + k); + xvel1[j + (1 - k) * vels_wk_stride] = xvel1[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -505,10 +544,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel1(j, y_max + 1 + 2 + k) = field.xvel1(j, y_max + 1 - k); + xvel1[j + (y_max + 1 + 2 + k) * vels_wk_stride] = xvel1[j + (y_max + 1 - k) * 
vels_wk_stride]; } } @@ -518,10 +558,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel1(1 - j, k) = -field.xvel1(1 + 2 + j, k); + xvel1[(1 - j) + (k) * vels_wk_stride] = -xvel1[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -531,10 +572,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel1(x_max + 2 + 1 + j, k) = -field.xvel1(x_max + 1 - j, k); + xvel1[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = -xvel1[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -547,10 +589,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel0(j, 1 - k) = -field.yvel0(j, 1 + 2 + k); + yvel0[j + (1 - k) * vels_wk_stride] = -yvel0[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -560,10 +603,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel0(j, y_max + 1 + 2 + k) = -field.yvel0(j, y_max + 1 - k); + yvel0[j + (y_max + 1 + 2 + k) * vels_wk_stride] = -yvel0[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -573,10 +617,11 @@ void 
update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel0(1 - j, k) = field.yvel0(1 + 2 + j, k); + yvel0[(1 - j) + (k) * vels_wk_stride] = yvel0[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -586,10 +631,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel0(x_max + 2 + 1 + j, k) = field.yvel0(x_max + 1 - j, k); + yvel0[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = yvel0[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -602,10 +648,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel1(j, 1 - k) = -field.yvel1(j, 1 + 2 + k); + yvel1[j + (1 - k) * vels_wk_stride] = -yvel1[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -615,10 +662,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel1(j, y_max + 1 + 2 + k) = -field.yvel1(j, y_max + 1 - k); + yvel1[j + (y_max + 1 + 2 + k) * vels_wk_stride] = -yvel1[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -628,10 +676,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - 
_Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel1(1 - j, k) = field.yvel1(1 + 2 + j, k); + yvel1[(1 - j) + (k) * vels_wk_stride] = yvel1[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -641,10 +690,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel1(x_max + 2 + 1 + j, k) = field.yvel1(x_max + 1 - j, k); + yvel1[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = yvel1[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -658,10 +708,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_x(j, 1 - k) = field.vol_flux_x(j, 1 + 2 + k); + vol_flux_x[j + (1 - k) * flux_x_stride] = vol_flux_x[j + (1 + 2 + k) * flux_x_stride]; } } @@ -671,10 +722,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_x(j, y_max + 2 + k) = field.vol_flux_x(j, y_max - k); + vol_flux_x[j + (y_max + 2 + k) * flux_x_stride] = vol_flux_x[j + (y_max - k) * flux_x_stride]; } } @@ -684,10 +736,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + 
double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_x(1 - j, k) = -field.vol_flux_x(1 + 2 + j, k); + vol_flux_x[(1 - j) + (k) * flux_x_stride] = -vol_flux_x[(1 + 2 + j) + (k) * flux_x_stride]; } } @@ -697,10 +750,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_x(x_max + j + 1 + 2, k) = -field.vol_flux_x(x_max + 1 - j, k); + vol_flux_x[(x_max + j + 1 + 2) + (k) * flux_x_stride] = -vol_flux_x[(x_max + 1 - j) + (k) * flux_x_stride]; } } @@ -714,10 +768,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_x(j, 1 - k) = field.mass_flux_x(j, 1 + 2 + k); + mass_flux_x[j + (1 - k) * flux_x_stride] = mass_flux_x[j + (1 + 2 + k) * flux_x_stride]; } } @@ -727,10 +782,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_x(j, y_max + 2 + k) = field.mass_flux_x(j, y_max - k); + mass_flux_x[j + (y_max + 2 + k) * flux_x_stride] = mass_flux_x[j + (y_max - k) * flux_x_stride]; } } @@ -740,10 +796,11 @@ void update_halo_kernel( // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_x(1 - j, k) = -field.mass_flux_x(1 + 2 + j, k); + mass_flux_x[(1 - j) + (k) * flux_x_stride] = -mass_flux_x[(1 + 2 + j) + (k) * flux_x_stride]; } } @@ -753,10 +810,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_x(x_max + j + 1 + 2, k) = -field.mass_flux_x(x_max + 1 - j, k); + mass_flux_x[(x_max + j + 1 + 2) + (k) * flux_x_stride] = -mass_flux_x[(x_max + 1 - j) + (k) * flux_x_stride]; } } @@ -770,10 +828,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_y(j, 1 - k) = -field.vol_flux_y(j, 1 + 2 + k); + vol_flux_y[j + (1 - k) * flux_y_stride] = -vol_flux_y[j + (1 + 2 + k) * flux_y_stride]; } } @@ -783,10 +842,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_y(j, y_max + k + 1 + 2) = -field.vol_flux_y(j, y_max + 1 - k); + vol_flux_y[j + (y_max + k + 1 + 2) * flux_y_stride] = -vol_flux_y[j + (y_max + 1 - k) * flux_y_stride]; } } @@ 
-796,10 +856,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_y(1 - j, k) = field.vol_flux_y(1 + 2 + j, k); + vol_flux_y[(1 - j) + (k) * flux_y_stride] = vol_flux_y[(1 + 2 + j) + (k) * flux_y_stride]; } } @@ -809,10 +870,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_y(x_max + 2 + j, k) = field.vol_flux_y(x_max - j, k); + vol_flux_y[(x_max + 2 + j) + (k) * flux_y_stride] = vol_flux_y[(x_max - j) + (k) * flux_y_stride]; } } @@ -825,10 +887,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_y(j, 1 - k) = -field.mass_flux_y(j, 1 + 2 + k); + mass_flux_y[j + (1 - k) * flux_y_stride] = -mass_flux_y[j + (1 + 2 + k) * flux_y_stride]; } } @@ -838,10 +901,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_y(j, y_max + k + 1 + 2) = -field.mass_flux_y(j, y_max + 1 - k); + mass_flux_y[j + (y_max + k + 1 + 2) * flux_y_stride] = -mass_flux_y[j + 
(y_max + 1 - k) * flux_y_stride]; } } @@ -851,10 +915,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_y(1 - j, k) = field.mass_flux_y(1 + 2 + j, k); + mass_flux_y[(1 - j) + (k) * flux_y_stride] = mass_flux_y[(1 + 2 + j) + (k) * flux_y_stride]; } } @@ -864,10 +929,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_y(x_max + 2 + j, k) = field.mass_flux_y(x_max - j, k); + mass_flux_y[(x_max + 2 + j) + (k) * flux_y_stride] = mass_flux_y[(x_max - j) + (k) * flux_y_stride]; } } @@ -903,10 +969,15 @@ void update_halo(global_variables &globals, int fields[NUM_FIELDS], int depth) { (globals.chunk.chunk_neighbours[chunk_bottom] == external_face) || (globals.chunk.chunk_neighbours[chunk_top] == external_face)) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; update_halo_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -918,6 +989,11 @@ void update_halo(global_variables &globals, int fields[NUM_FIELDS], int depth) { depth); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + + } diff --git a/src/update_tile_halo.cpp b/src/update_tile_halo.cpp index 9bc9d25..1c54243 100644 --- a/src/update_tile_halo.cpp +++ b/src/update_tile_halo.cpp @@ -30,6 +30,10 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep // Update Top 
Bottom - Real to Real + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &tt = globals.chunk.tiles[tile]; @@ -39,6 +43,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_up != external_tile) { tile_type &tup = globals.chunk.tiles[t_up]; update_tile_halo_t_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -85,6 +90,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_down != external_tile) { tile_type &tdown = globals.chunk.tiles[t_down]; update_tile_halo_b_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -139,6 +145,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_left != external_tile) { tile_type &tleft = globals.chunk.tiles[t_left]; update_tile_halo_l_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -184,6 +191,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_right != external_tile) { tile_type &tright = globals.chunk.tiles[t_right]; update_tile_halo_r_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -227,5 +235,9 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep } } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/update_tile_halo_kernel.cpp b/src/update_tile_halo_kernel.cpp index de6f8dc..c57daa0 100644 --- a/src/update_tile_halo_kernel.cpp +++ b/src/update_tile_halo_kernel.cpp @@ -18,7 +18,7 @@ */ #include "update_tile_halo_kernel.h" -#include "utils.hpp" + // @brief Fortran kernel to update the external halo cells in a chunk. // @author Wayne Gaudin @@ -28,40 +28,45 @@ // reflective. 
void update_tile_halo_l_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int left_xmin, int left_xmax, - int left_ymin, int left_ymax, clover::Buffer2D &left_density0, - clover::Buffer2D &left_energy0, - clover::Buffer2D &left_pressure, - clover::Buffer2D &left_viscosity, - clover::Buffer2D &left_soundspeed, - clover::Buffer2D &left_density1, - clover::Buffer2D &left_energy1, - clover::Buffer2D &left_xvel0, - clover::Buffer2D &left_yvel0, - clover::Buffer2D &left_xvel1, - clover::Buffer2D &left_yvel1, - clover::Buffer2D &left_vol_flux_x, - clover::Buffer2D &left_vol_flux_y, - clover::Buffer2D &left_mass_flux_x, - clover::Buffer2D &left_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int left_xmin, int left_xmax, + int left_ymin, int left_ymax, clover::Buffer2D &left_density0_buffer, + clover::Buffer2D &left_energy0_buffer, + clover::Buffer2D &left_pressure_buffer, + clover::Buffer2D &left_viscosity_buffer, + clover::Buffer2D &left_soundspeed_buffer, + clover::Buffer2D 
&left_density1_buffer, + clover::Buffer2D &left_energy1_buffer, + clover::Buffer2D &left_xvel0_buffer, + clover::Buffer2D &left_yvel0_buffer, + clover::Buffer2D &left_xvel1_buffer, + clover::Buffer2D &left_yvel1_buffer, + clover::Buffer2D &left_vol_flux_x_buffer, + clover::Buffer2D &left_vol_flux_y_buffer, + clover::Buffer2D &left_mass_flux_x_buffer, + clover::Buffer2D &left_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *left_density0 = left_density0_buffer.data; + const int left_density0_sizex = left_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density0(x_min - j, k) = left_density0(left_xmax + 1 - j, k); + density0[(x_min - j) + (k) * density0_sizex] = left_density0[(left_xmax + 1 - j) + (k) * left_density0_sizex]; } } } @@ -70,10 +75,14 @@ void update_tile_halo_l_kernel( if (fields[field_density1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *left_density1 = left_density1_buffer.data; + const int left_density1_sizex = left_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density1(x_min - j, k) = left_density1(left_xmax + 1 - j, k); + density1[(x_min - j) + (k) * density1_sizex] = left_density1[(left_xmax + 1 - j) + (k) * left_density1_sizex]; } } } @@ -82,10 +91,14 @@ void update_tile_halo_l_kernel( if (fields[field_energy0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = 
energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *left_energy0 = left_energy0_buffer.data; + const int left_energy0_sizex = left_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy0(x_min - j, k) = left_energy0(left_xmax + 1 - j, k); + energy0[(x_min - j) + (k) * energy0_sizex] = left_energy0[(left_xmax + 1 - j) + (k) * left_energy0_sizex]; } } } @@ -94,10 +107,14 @@ void update_tile_halo_l_kernel( if (fields[field_energy1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *left_energy1 = left_energy1_buffer.data; + const int left_energy1_sizex = left_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy1(x_min - j, k) = left_energy1(left_xmax + 1 - j, k); + energy1[(x_min - j) + (k) * energy1_sizex] = left_energy1[(left_xmax + 1 - j) + (k) * left_energy1_sizex]; } } } @@ -106,10 +123,14 @@ void update_tile_halo_l_kernel( if (fields[field_pressure] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *left_pressure = left_pressure_buffer.data; + const int left_pressure_sizex = left_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - pressure(x_min - j, k) = left_pressure(left_xmax + 1 - j, k); + pressure[(x_min - j) + (k) * pressure_sizex] = left_pressure[(left_xmax + 1 - j) + (k) * left_pressure_sizex]; } } } @@ -118,10 +139,14 @@ void 
update_tile_halo_l_kernel( if (fields[field_viscosity] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *left_viscosity = left_viscosity_buffer.data; + const int left_viscosity_sizex = left_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - viscosity(x_min - j, k) = left_viscosity(left_xmax + 1 - j, k); + viscosity[(x_min - j) + (k) * viscosity_sizex] = left_viscosity[(left_xmax + 1 - j) + (k) * left_viscosity_sizex]; } } } @@ -130,10 +155,14 @@ void update_tile_halo_l_kernel( if (fields[field_soundspeed] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *left_soundspeed = left_soundspeed_buffer.data; + const int left_soundspeed_sizex = left_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - soundspeed(x_min - j, k) = left_soundspeed(left_xmax + 1 - j, k); + soundspeed[(x_min - j) + (k) * soundspeed_sizex] = left_soundspeed[(left_xmax + 1 - j) + (k) * left_soundspeed_sizex]; } } } @@ -142,10 +171,14 @@ void update_tile_halo_l_kernel( if (fields[field_xvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *left_xvel0 = left_xvel0_buffer.data; + const int left_xvel0_sizex = left_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel0(x_min - j, k) = 
left_xvel0(left_xmax + 1 - j, k); + xvel0[(x_min - j) + (k) * xvel0_sizex] = left_xvel0[(left_xmax + 1 - j) + (k) * left_xvel0_sizex]; } } } @@ -154,10 +187,14 @@ void update_tile_halo_l_kernel( if (fields[field_xvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *left_xvel1 = left_xvel1_buffer.data; + const int left_xvel1_sizex = left_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel1(x_min - j, k) = left_xvel1(left_xmax + 1 - j, k); + xvel1[(x_min - j) + (k) * xvel1_sizex] = left_xvel1[(left_xmax + 1 - j) + (k) * left_xvel1_sizex]; } } } @@ -166,10 +203,14 @@ void update_tile_halo_l_kernel( if (fields[field_yvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *left_yvel0 = left_yvel0_buffer.data; + const int left_yvel0_sizex = left_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel0(x_min - j, k) = left_yvel0(left_xmax + 1 - j, k); + yvel0[(x_min - j) + (k) * yvel0_sizex] = left_yvel0[(left_xmax + 1 - j) + (k) * left_yvel0_sizex]; } } } @@ -178,10 +219,14 @@ void update_tile_halo_l_kernel( if (fields[field_yvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *left_yvel1 = left_yvel1_buffer.data; + const int left_yvel1_sizex = left_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 
0; j < depth; ++j) { - yvel1(x_min - j, k) = left_yvel1(left_xmax + 1 - j, k); + yvel1[(x_min - j) + (k) * yvel1_sizex] = left_yvel1[(left_xmax + 1 - j) + (k) * left_yvel1_sizex]; } } } @@ -190,10 +235,14 @@ void update_tile_halo_l_kernel( if (fields[field_vol_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *left_vol_flux_x = left_vol_flux_x_buffer.data; + const int left_vol_flux_x_sizex = left_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_x(x_min - j, k) = left_vol_flux_x(left_xmax + 1 - j, k); + vol_flux_x[(x_min - j) + (k) * vol_flux_x_sizex] = left_vol_flux_x[(left_xmax + 1 - j) + (k) * left_vol_flux_x_sizex]; } } } @@ -202,10 +251,14 @@ void update_tile_halo_l_kernel( if (fields[field_mass_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *left_mass_flux_x = left_mass_flux_x_buffer.data; + const int left_mass_flux_x_sizex = left_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_x(x_min - j, k) = left_mass_flux_x(left_xmax + 1 - j, k); + mass_flux_x[(x_min - j) + (k) * mass_flux_x_sizex] = left_mass_flux_x[(left_xmax + 1 - j) + (k) * left_mass_flux_x_sizex]; } } } @@ -214,10 +267,14 @@ void update_tile_halo_l_kernel( if (fields[field_vol_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *left_vol_flux_y = 
left_vol_flux_y_buffer.data; + const int left_vol_flux_y_sizex = left_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_y(x_min - j, k) = left_vol_flux_y(left_xmax + 1 - j, k); + vol_flux_y[(x_min - j) + (k) * vol_flux_y_sizex] = left_vol_flux_y[(left_xmax + 1 - j) + (k) * left_vol_flux_y_sizex]; } } } @@ -226,50 +283,59 @@ void update_tile_halo_l_kernel( if (fields[field_mass_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *left_mass_flux_y = left_mass_flux_y_buffer.data; + const int left_mass_flux_y_sizex = left_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_y(x_min - j, k) = left_mass_flux_y(left_xmax + 1 - j, k); + mass_flux_y[(x_min - j) + (k) * mass_flux_y_sizex] = left_mass_flux_y[(left_xmax + 1 - j) + (k) * left_mass_flux_y_sizex]; } } } } void update_tile_halo_r_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int right_xmin, int right_xmax, - int right_ymin, int right_ymax, clover::Buffer2D &right_density0, - clover::Buffer2D &right_energy0, - clover::Buffer2D &right_pressure, - clover::Buffer2D &right_viscosity, - 
clover::Buffer2D &right_soundspeed, - clover::Buffer2D &right_density1, - clover::Buffer2D &right_energy1, - clover::Buffer2D &right_xvel0, - clover::Buffer2D &right_yvel0, - clover::Buffer2D &right_xvel1, - clover::Buffer2D &right_yvel1, - clover::Buffer2D &right_vol_flux_x, - clover::Buffer2D &right_vol_flux_y, - clover::Buffer2D &right_mass_flux_x, - clover::Buffer2D &right_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int right_xmin, int right_xmax, + int right_ymin, int right_ymax, clover::Buffer2D &right_density0_buffer, + clover::Buffer2D &right_energy0_buffer, + clover::Buffer2D &right_pressure_buffer, + clover::Buffer2D &right_viscosity_buffer, + clover::Buffer2D &right_soundspeed_buffer, + clover::Buffer2D &right_density1_buffer, + clover::Buffer2D &right_energy1_buffer, + clover::Buffer2D &right_xvel0_buffer, + clover::Buffer2D &right_yvel0_buffer, + clover::Buffer2D &right_xvel1_buffer, + clover::Buffer2D &right_yvel1_buffer, + clover::Buffer2D &right_vol_flux_x_buffer, + clover::Buffer2D &right_vol_flux_y_buffer, + clover::Buffer2D &right_mass_flux_x_buffer, + clover::Buffer2D &right_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *right_density0 = right_density0_buffer.data; + const int right_density0_sizex 
= right_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density0(x_max + 2 + j, k) = right_density0(right_xmin - 1 + 2 + j, k); + density0[(x_max + 2 + j) + (k) * density0_sizex] = right_density0[(right_xmin - 1 + 2 + j) + (k) * right_density0_sizex]; } } } @@ -278,10 +344,14 @@ void update_tile_halo_r_kernel( if (fields[field_density1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *right_density1 = right_density1_buffer.data; + const int right_density1_sizex = right_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density1(x_max + 2 + j, k) = right_density1(right_xmin - 1 + 2 + j, k); + density1[(x_max + 2 + j) + (k) * density1_sizex] = right_density1[(right_xmin - 1 + 2 + j) + (k) * right_density1_sizex]; } } } @@ -290,10 +360,14 @@ void update_tile_halo_r_kernel( if (fields[field_energy0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *right_energy0 = right_energy0_buffer.data; + const int right_energy0_sizex = right_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy0(x_max + 2 + j, k) = right_energy0(right_xmin - 1 + 2 + j, k); + energy0[(x_max + 2 + j) + (k) * energy0_sizex] = right_energy0[(right_xmin - 1 + 2 + j) + (k) * right_energy0_sizex]; } } } @@ -302,10 +376,14 @@ void update_tile_halo_r_kernel( if (fields[field_energy1] == 1) { // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *right_energy1 = right_energy1_buffer.data; + const int right_energy1_sizex = right_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy1(x_max + 2 + j, k) = right_energy1(right_xmin - 1 + 2 + j, k); + energy1[(x_max + 2 + j) + (k) * energy1_sizex] = right_energy1[(right_xmin - 1 + 2 + j) + (k) * right_energy1_sizex]; } } } @@ -314,10 +392,14 @@ void update_tile_halo_r_kernel( if (fields[field_pressure] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *right_pressure = right_pressure_buffer.data; + const int right_pressure_sizex = right_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - pressure(x_max + 2 + j, k) = right_pressure(right_xmin - 1 + 2 + j, k); + pressure[(x_max + 2 + j) + (k) * pressure_sizex] = right_pressure[(right_xmin - 1 + 2 + j) + (k) * right_pressure_sizex]; } } } @@ -326,10 +408,14 @@ void update_tile_halo_r_kernel( if (fields[field_viscosity] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *right_viscosity = right_viscosity_buffer.data; + const int right_viscosity_sizex = right_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - viscosity(x_max + 2 + j, k) = right_viscosity(right_xmin - 1 + 2 + 
j, k); + viscosity[(x_max + 2 + j) + (k) * viscosity_sizex] = right_viscosity[(right_xmin - 1 + 2 + j) + (k) * right_viscosity_sizex]; } } } @@ -338,10 +424,14 @@ void update_tile_halo_r_kernel( if (fields[field_soundspeed] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *right_soundspeed = right_soundspeed_buffer.data; + const int right_soundspeed_sizex = right_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - soundspeed(x_max + 2 + j, k) = right_soundspeed(right_xmin - 1 + 2 + j, k); + soundspeed[(x_max + 2 + j) + (k) * soundspeed_sizex] = right_soundspeed[(right_xmin - 1 + 2 + j) + (k) * right_soundspeed_sizex]; } } } @@ -350,10 +440,14 @@ void update_tile_halo_r_kernel( if (fields[field_xvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *right_xvel0 = right_xvel0_buffer.data; + const int right_xvel0_sizex = right_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel0(x_max + 1 + 2 + j, k) = right_xvel0(right_xmin + 1 - 1 + 2 + j, k); + xvel0[(x_max + 1 + 2 + j) + (k) * xvel0_sizex] = right_xvel0[(right_xmin + 1 - 1 + 2 + j) + (k) * right_xvel0_sizex]; } } } @@ -362,10 +456,14 @@ void update_tile_halo_r_kernel( if (fields[field_xvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *right_xvel1 = right_xvel1_buffer.data; + const int right_xvel1_sizex = right_xvel1_buffer.nX(); + #pragma omp target teams 
distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel1(x_max + 1 + 2 + j, k) = right_xvel1(right_xmin + 1 - 1 + 2 + j, k); + xvel1[(x_max + 1 + 2 + j) + (k) * xvel1_sizex] = right_xvel1[(right_xmin + 1 - 1 + 2 + j) + (k) * right_xvel1_sizex]; } } } @@ -374,10 +472,14 @@ void update_tile_halo_r_kernel( if (fields[field_yvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *right_yvel0 = right_yvel0_buffer.data; + const int right_yvel0_sizex = right_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel0(x_max + 1 + 2 + j, k) = right_yvel0(right_xmin + 1 - 1 + 2 + j, k); + yvel0[(x_max + 1 + 2 + j) + (k) * yvel0_sizex] = right_yvel0[(right_xmin + 1 - 1 + 2 + j) + (k) * right_yvel0_sizex]; } } } @@ -386,10 +488,14 @@ void update_tile_halo_r_kernel( if (fields[field_yvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *right_yvel1 = right_yvel1_buffer.data; + const int right_yvel1_sizex = right_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel1(x_max + 1 + 2 + j, k) = right_yvel1(right_xmin + 1 - 1 + 2 + j, k); + yvel1[(x_max + 1 + 2 + j) + (k) * yvel1_sizex] = right_yvel1[(right_xmin + 1 - 1 + 2 + j) + (k) * right_yvel1_sizex]; } } } @@ -398,10 +504,14 @@ void update_tile_halo_r_kernel( if (fields[field_vol_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = 
vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *right_vol_flux_x = right_vol_flux_x_buffer.data; + const int right_vol_flux_x_sizex = right_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_x(x_max + 1 + 2 + j, k) = right_vol_flux_x(right_xmin + 1 - 1 + 2 + j, k); + vol_flux_x[(x_max + 1 + 2 + j) + (k) * vol_flux_x_sizex] = right_vol_flux_x[(right_xmin + 1 - 1 + 2 + j) + (k) * right_vol_flux_x_sizex]; } } } @@ -410,10 +520,14 @@ void update_tile_halo_r_kernel( if (fields[field_mass_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *right_mass_flux_x = right_mass_flux_x_buffer.data; + const int right_mass_flux_x_sizex = right_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_x(x_max + 1 + 2 + j, k) = right_mass_flux_x(right_xmin + 1 - 1 + 2 + j, k); + mass_flux_x[(x_max + 1 + 2 + j) + (k) * mass_flux_x_sizex] = right_mass_flux_x[(right_xmin + 1 - 1 + 2 + j) + (k) * right_mass_flux_x_sizex]; } } } @@ -422,10 +536,14 @@ void update_tile_halo_r_kernel( if (fields[field_vol_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *right_vol_flux_y = right_vol_flux_y_buffer.data; + const int right_vol_flux_y_sizex = right_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - 
vol_flux_y(x_max + 2 + j, k) = right_vol_flux_y(right_xmin - 1 + 2 + j, k); + vol_flux_y[(x_max + 2 + j) + (k) * vol_flux_y_sizex] = right_vol_flux_y[(right_xmin - 1 + 2 + j) + (k) * right_vol_flux_y_sizex]; } } } @@ -434,10 +552,14 @@ void update_tile_halo_r_kernel( if (fields[field_mass_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *right_mass_flux_y = right_mass_flux_y_buffer.data; + const int right_mass_flux_y_sizex = right_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_y(x_max + 2 + j, k) = right_mass_flux_y(right_xmin - 1 + 2 + j, k); + mass_flux_y[(x_max + 2 + j) + (k) * mass_flux_y_sizex] = right_mass_flux_y[(right_xmin - 1 + 2 + j) + (k) * right_mass_flux_y_sizex]; } } } @@ -448,38 +570,43 @@ void update_tile_halo_r_kernel( // communication void update_tile_halo_t_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int top_xmin, int top_xmax, - int top_ymin, int top_ymax, clover::Buffer2D &top_density0, - clover::Buffer2D &top_energy0, - clover::Buffer2D &top_pressure, - clover::Buffer2D &top_viscosity, - clover::Buffer2D &top_soundspeed, - clover::Buffer2D &top_density1, - clover::Buffer2D &top_energy1, - clover::Buffer2D &top_xvel0, clover::Buffer2D &top_yvel0, - clover::Buffer2D &top_xvel1, 
clover::Buffer2D &top_yvel1, - clover::Buffer2D &top_vol_flux_x, - clover::Buffer2D &top_vol_flux_y, - clover::Buffer2D &top_mass_flux_x, - clover::Buffer2D &top_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int top_xmin, int top_xmax, + int top_ymin, int top_ymax, clover::Buffer2D &top_density0_buffer, + clover::Buffer2D &top_energy0_buffer, + clover::Buffer2D &top_pressure_buffer, + clover::Buffer2D &top_viscosity_buffer, + clover::Buffer2D &top_soundspeed_buffer, + clover::Buffer2D &top_density1_buffer, + clover::Buffer2D &top_energy1_buffer, + clover::Buffer2D &top_xvel0_buffer, clover::Buffer2D &top_yvel0_buffer, + clover::Buffer2D &top_xvel1_buffer, clover::Buffer2D &top_yvel1_buffer, + clover::Buffer2D &top_vol_flux_x_buffer, + clover::Buffer2D &top_vol_flux_y_buffer, + clover::Buffer2D &top_mass_flux_x_buffer, + clover::Buffer2D &top_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *top_density0 = top_density0_buffer.data; + const int top_density0_sizex = top_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density0(j, y_max + 2 + k) = top_density0(j, 
top_ymin - 1 + 2 + k); + density0[j + (y_max + 2 + k) * density0_sizex] = top_density0[j + (top_ymin - 1 + 2 + k) * top_density0_sizex]; } } } @@ -489,9 +616,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *top_density1 = top_density1_buffer.data; + const int top_density1_sizex = top_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density1(j, y_max + 2 + k) = top_density1(j, top_ymin - 1 + 2 + k); + density1[j + (y_max + 2 + k) * density1_sizex] = top_density1[j + (top_ymin - 1 + 2 + k) * top_density1_sizex]; } } } @@ -501,9 +632,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *top_energy0 = top_energy0_buffer.data; + const int top_energy0_sizex = top_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy0(j, y_max + 2 + k) = top_energy0(j, top_ymin - 1 + 2 + k); + energy0[j + (y_max + 2 + k) * energy0_sizex] = top_energy0[j + (top_ymin - 1 + 2 + k) * top_energy0_sizex]; } } } @@ -513,9 +648,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *top_energy1 = top_energy1_buffer.data; + const int top_energy1_sizex = top_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - 
energy1(j, y_max + 2 + k) = top_energy1(j, top_ymin - 1 + 2 + k); + energy1[j + (y_max + 2 + k) * energy1_sizex] = top_energy1[j + (top_ymin - 1 + 2 + k) * top_energy1_sizex]; } } } @@ -525,9 +664,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *top_pressure = top_pressure_buffer.data; + const int top_pressure_sizex = top_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - pressure(j, y_max + 2 + k) = top_pressure(j, top_ymin - 1 + 2 + k); + pressure[j + (y_max + 2 + k) * pressure_sizex] = top_pressure[j + (top_ymin - 1 + 2 + k) * top_pressure_sizex]; } } } @@ -537,9 +680,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *top_viscosity = top_viscosity_buffer.data; + const int top_viscosity_sizex = top_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - viscosity(j, y_max + 2 + k) = top_viscosity(j, top_ymin - 1 + 2 + k); + viscosity[j + (y_max + 2 + k) * viscosity_sizex] = top_viscosity[j + (top_ymin - 1 + 2 + k) * top_viscosity_sizex]; } } } @@ -549,9 +696,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *top_soundspeed = top_soundspeed_buffer.data; + const int top_soundspeed_sizex = top_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd 
clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - soundspeed(j, y_max + 2 + k) = top_soundspeed(j, top_ymin - 1 + 2 + k); + soundspeed[j + (y_max + 2 + k) * soundspeed_sizex] = top_soundspeed[j + (top_ymin - 1 + 2 + k) * top_soundspeed_sizex]; } } } @@ -561,9 +712,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *top_xvel0 = top_xvel0_buffer.data; + const int top_xvel0_sizex = top_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel0(j, y_max + 1 + 2 + k) = top_xvel0(j, top_ymin + 1 - 1 + 2 + k); + xvel0[j + (y_max + 1 + 2 + k) * xvel0_sizex] = top_xvel0[j + (top_ymin + 1 - 1 + 2 + k) * top_xvel0_sizex]; } } } @@ -573,9 +728,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *top_xvel1 = top_xvel1_buffer.data; + const int top_xvel1_sizex = top_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel1(j, y_max + 1 + 2 + k) = top_xvel1(j, top_ymin + 1 - 1 + 2 + k); + xvel1[j + (y_max + 1 + 2 + k) * xvel1_sizex] = top_xvel1[j + (top_ymin + 1 - 1 + 2 + k) * top_xvel1_sizex]; } } } @@ -585,9 +744,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *top_yvel0 = top_yvel0_buffer.data; + const int top_yvel0_sizex = top_yvel0_buffer.nX(); + #pragma omp target teams distribute 
parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel0(j, y_max + 1 + 2 + k) = top_yvel0(j, top_ymin + 1 - 1 + 2 + k); + yvel0[j + (y_max + 1 + 2 + k) * yvel0_sizex] = top_yvel0[j + (top_ymin + 1 - 1 + 2 + k) * top_yvel0_sizex]; } } } @@ -597,9 +760,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *top_yvel1 = top_yvel1_buffer.data; + const int top_yvel1_sizex = top_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel1(j, y_max + 1 + 2 + k) = top_yvel1(j, top_ymin + 1 - 1 + 2 + k); + yvel1[j + (y_max + 1 + 2 + k) * yvel1_sizex] = top_yvel1[j + (top_ymin + 1 - 1 + 2 + k) * top_yvel1_sizex]; } } } @@ -609,9 +776,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *top_vol_flux_x = top_vol_flux_x_buffer.data; + const int top_vol_flux_x_sizex = top_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - vol_flux_x(j, y_max + 2 + k) = top_vol_flux_x(j, top_ymin - 1 + 2 + k); + vol_flux_x[j + (y_max + 2 + k) * vol_flux_x_sizex] = top_vol_flux_x[j + (top_ymin - 1 + 2 + k) * top_vol_flux_x_sizex]; } } } @@ -621,9 +792,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *top_mass_flux_x = 
top_mass_flux_x_buffer.data; + const int top_mass_flux_x_sizex = top_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - mass_flux_x(j, y_max + 2 + k) = top_mass_flux_x(j, top_ymin - 1 + 2 + k); + mass_flux_x[j + (y_max + 2 + k) * mass_flux_x_sizex] = top_mass_flux_x[j + (top_ymin - 1 + 2 + k) * top_mass_flux_x_sizex]; } } } @@ -633,9 +808,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *top_vol_flux_y = top_vol_flux_y_buffer.data; + const int top_vol_flux_y_sizex = top_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - vol_flux_y(j, y_max + 1 + 2 + k) = top_vol_flux_y(j, top_ymin + 1 - 1 + 2 + k); + vol_flux_y[j + (y_max + 1 + 2 + k) * vol_flux_y_sizex] = top_vol_flux_y[j + (top_ymin + 1 - 1 + 2 + k) * top_vol_flux_y_sizex]; } } } @@ -645,50 +824,59 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *top_mass_flux_y = top_mass_flux_y_buffer.data; + const int top_mass_flux_y_sizex = top_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - mass_flux_y(j, y_max + 1 + 2 + k) = top_mass_flux_y(j, top_ymin + 1 - 1 + 2 + k); + mass_flux_y[j + (y_max + 1 + 2 + k) * mass_flux_y_sizex] = top_mass_flux_y[j + (top_ymin + 1 - 1 + 2 + k) * top_mass_flux_y_sizex]; } } } } void update_tile_halo_b_kernel( + bool use_target, int 
x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int bottom_xmin, int bottom_xmax, + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int bottom_xmin, int bottom_xmax, int bottom_ymin, int bottom_ymax, - clover::Buffer2D &bottom_density0, - clover::Buffer2D &bottom_energy0, - clover::Buffer2D &bottom_pressure, - clover::Buffer2D &bottom_viscosity, - clover::Buffer2D &bottom_soundspeed, - clover::Buffer2D &bottom_density1, - clover::Buffer2D &bottom_energy1, - clover::Buffer2D &bottom_xvel0, - clover::Buffer2D &bottom_yvel0, - clover::Buffer2D &bottom_xvel1, - clover::Buffer2D &bottom_yvel1, - clover::Buffer2D &bottom_vol_flux_x, - clover::Buffer2D &bottom_vol_flux_y, - clover::Buffer2D &bottom_mass_flux_x, - clover::Buffer2D &bottom_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &bottom_density0_buffer, + clover::Buffer2D &bottom_energy0_buffer, + clover::Buffer2D &bottom_pressure_buffer, + clover::Buffer2D &bottom_viscosity_buffer, + clover::Buffer2D &bottom_soundspeed_buffer, + clover::Buffer2D &bottom_density1_buffer, + clover::Buffer2D 
&bottom_energy1_buffer, + clover::Buffer2D &bottom_xvel0_buffer, + clover::Buffer2D &bottom_yvel0_buffer, + clover::Buffer2D &bottom_xvel1_buffer, + clover::Buffer2D &bottom_yvel1_buffer, + clover::Buffer2D &bottom_vol_flux_x_buffer, + clover::Buffer2D &bottom_vol_flux_y_buffer, + clover::Buffer2D &bottom_mass_flux_x_buffer, + clover::Buffer2D &bottom_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *bottom_density0 = bottom_density0_buffer.data; + const int bottom_density0_sizex = bottom_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density0(j, y_min - k) = bottom_density0(j, bottom_ymax + 1 - k); + density0[j + (y_min - k) * density0_sizex] = bottom_density0[j + (bottom_ymax + 1 - k) * bottom_density0_sizex]; } } } @@ -698,9 +886,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *bottom_density1 = bottom_density1_buffer.data; + const int bottom_density1_sizex = bottom_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density1(j, y_min - k) = bottom_density1(j, bottom_ymax + 1 - k); + density1[j + (y_min - k) * density1_sizex] = bottom_density1[j + (bottom_ymax + 1 - k) * bottom_density1_sizex]; } } } @@ -710,9 +902,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + 
const int energy0_sizex = energy0_buffer.nX(); + double *bottom_energy0 = bottom_energy0_buffer.data; + const int bottom_energy0_sizex = bottom_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy0(j, y_min - k) = bottom_energy0(j, bottom_ymax + 1 - k); + energy0[j + (y_min - k) * energy0_sizex] = bottom_energy0[j + (bottom_ymax + 1 - k) * bottom_energy0_sizex]; } } } @@ -722,9 +918,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *bottom_energy1 = bottom_energy1_buffer.data; + const int bottom_energy1_sizex = bottom_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy1(j, y_min - k) = bottom_energy1(j, bottom_ymax + 1 - k); + energy1[j + (y_min - k) * energy1_sizex] = bottom_energy1[j + (bottom_ymax + 1 - k) * bottom_energy1_sizex]; } } } @@ -734,9 +934,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *bottom_pressure = bottom_pressure_buffer.data; + const int bottom_pressure_sizex = bottom_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - pressure(j, y_min - k) = bottom_pressure(j, bottom_ymax + 1 - k); + pressure[j + (y_min - k) * pressure_sizex] = bottom_pressure[j + (bottom_ymax + 1 - k) * bottom_pressure_sizex]; } } } @@ -746,9 +950,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, 
x_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *bottom_viscosity = bottom_viscosity_buffer.data; + const int bottom_viscosity_sizex = bottom_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - viscosity(j, y_min - k) = bottom_viscosity(j, bottom_ymax + 1 - k); + viscosity[j + (y_min - k) * viscosity_sizex] = bottom_viscosity[j + (bottom_ymax + 1 - k) * bottom_viscosity_sizex]; } } } @@ -758,9 +966,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *bottom_soundspeed = bottom_soundspeed_buffer.data; + const int bottom_soundspeed_sizex = bottom_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - soundspeed(j, y_min - k) = bottom_soundspeed(j, bottom_ymax + 1 - k); + soundspeed[j + (y_min - k) * soundspeed_sizex] = bottom_soundspeed[j + (bottom_ymax + 1 - k) * bottom_soundspeed_sizex]; } } } @@ -770,9 +982,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *bottom_xvel0 = bottom_xvel0_buffer.data; + const int bottom_xvel0_sizex = bottom_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel0(j, y_min - k) = bottom_xvel0(j, bottom_ymax + 1 - k); + xvel0[j + (y_min - k) * xvel0_sizex] = bottom_xvel0[j + (bottom_ymax + 1 - k) * bottom_xvel0_sizex]; } } } @@ 
-782,9 +998,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *bottom_xvel1 = bottom_xvel1_buffer.data; + const int bottom_xvel1_sizex = bottom_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel1(j, y_min - k) = bottom_xvel1(j, bottom_ymax + 1 - k); + xvel1[j + (y_min - k) * xvel1_sizex] = bottom_xvel1[j + (bottom_ymax + 1 - k) * bottom_xvel1_sizex]; } } } @@ -794,9 +1014,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *bottom_yvel0 = bottom_yvel0_buffer.data; + const int bottom_yvel0_sizex = bottom_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel0(j, y_min - k) = bottom_yvel0(j, bottom_ymax + 1 - k); + yvel0[j + (y_min - k) * yvel0_sizex] = bottom_yvel0[j + (bottom_ymax + 1 - k) * bottom_yvel0_sizex]; } } } @@ -806,9 +1030,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *bottom_yvel1 = bottom_yvel1_buffer.data; + const int bottom_yvel1_sizex = bottom_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel1(j, y_min - k) = bottom_yvel1(j, bottom_ymax + 1 - k); + yvel1[j + (y_min - k) * yvel1_sizex] = bottom_yvel1[j + (bottom_ymax + 1 - k) * bottom_yvel1_sizex]; } } } @@ -818,9 
+1046,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *bottom_vol_flux_x = bottom_vol_flux_x_buffer.data; + const int bottom_vol_flux_x_sizex = bottom_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - vol_flux_x(j, y_min - k) = bottom_vol_flux_x(j, bottom_ymax + 1 - k); + vol_flux_x[j + (y_min - k) * vol_flux_x_sizex] = bottom_vol_flux_x[j + (bottom_ymax + 1 - k) * bottom_vol_flux_x_sizex]; } } } @@ -830,9 +1062,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *bottom_mass_flux_x = bottom_mass_flux_x_buffer.data; + const int bottom_mass_flux_x_sizex = bottom_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - mass_flux_x(j, y_min - k) = bottom_mass_flux_x(j, bottom_ymax + 1 - k); + mass_flux_x[j + (y_min - k) * mass_flux_x_sizex] = bottom_mass_flux_x[j + (bottom_ymax + 1 - k) * bottom_mass_flux_x_sizex]; } } } @@ -842,9 +1078,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *bottom_vol_flux_y = bottom_vol_flux_y_buffer.data; + const int bottom_vol_flux_y_sizex = bottom_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { 
- vol_flux_y(j, y_min - k) = bottom_vol_flux_y(j, bottom_ymax + 1 - k); + vol_flux_y[j + (y_min - k) * vol_flux_y_sizex] = bottom_vol_flux_y[j + (bottom_ymax + 1 - k) * bottom_vol_flux_y_sizex]; } } } @@ -854,9 +1094,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *bottom_mass_flux_y = bottom_mass_flux_y_buffer.data; + const int bottom_mass_flux_y_sizex = bottom_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - mass_flux_y(j, y_min - k) = bottom_mass_flux_y(j, bottom_ymax + 1 - k); + mass_flux_y[j + (y_min - k) * mass_flux_y_sizex] = bottom_mass_flux_y[j + (bottom_ymax + 1 - k) * bottom_mass_flux_y_sizex]; } } } diff --git a/src/update_tile_halo_kernel.h b/src/update_tile_halo_kernel.h index 3c3d305..897b184 100644 --- a/src/update_tile_halo_kernel.h +++ b/src/update_tile_halo_kernel.h @@ -22,9 +22,10 @@ #define UPDATE_TILE_HALO_KERNEL_H #include "definitions.h" -#include "utils.hpp" + void update_tile_halo_l_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -62,6 +63,7 @@ void update_tile_halo_l_kernel( void update_tile_halo_r_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -98,6 +100,7 @@ void update_tile_halo_r_kernel( int depth); void update_tile_halo_t_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -135,6 +138,7 @@ void update_tile_halo_t_kernel( void update_tile_halo_b_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, diff --git 
a/src/utils.hpp b/src/utils.hpp deleted file mode 100644 index 7abef27..0000000 --- a/src/utils.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - Crown Copyright 2012 AWE. - - This file is part of CloverLeaf. - - CloverLeaf is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the - Free Software Foundation, either version 3 of the License, or (at your option) - any later version. - - CloverLeaf is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - details. - - You should have received a copy of the GNU General Public License along with - CloverLeaf. If not, see http://www.gnu.org/licenses/. - */ - -#ifndef UTILS_HPP -#define UTILS_HPP - -#include -#include -#include -#include - -namespace clover { - - template - struct Buffer1D { - - std::vector data; - - explicit Buffer1D(size_t size) : data(size) {} - - T operator[](size_t i) const { return data[i]; } - T &operator[](size_t i) { return data[i]; } - - T *actual() { return data.data(); } - - [[nodiscard]] size_t size() const { return data.size(); } - - friend std::ostream &operator<<(std::ostream &os, const Buffer1D &buffer) { - os << "Buffer1D(size: " << buffer.size << ")"; - return os; - } - - }; - - template - struct Buffer2D { - - const size_t sizeX, sizeY; - std::vector data; - - Buffer2D(size_t sizeX, size_t sizeY) : sizeX(sizeX), sizeY(sizeY), data(sizeX * sizeY) {} - - T &operator()(size_t i, size_t j) { return data[i + j * sizeX]; } - T const &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } - - - T *actual() { return data.data(); } - - friend std::ostream &operator<<(std::ostream &os, const Buffer2D &buffer) { - os << "Buffer2D(sizeX: " << buffer.sizeX << " sizeY: " << buffer.sizeY << ")"; - return os; - } - - - }; - - -} - - -using namespace clover; - -#endif 
//UTILS_HPP diff --git a/src/viscosity.cpp b/src/viscosity.cpp index 9f0830c..ab26f49 100644 --- a/src/viscosity.cpp +++ b/src/viscosity.cpp @@ -19,7 +19,6 @@ #include #include "viscosity.h" -#include "utils.hpp" // @brief Fortran viscosity kernel. // @author Wayne Gaudin @@ -27,52 +26,61 @@ // smooth out shock front and prevent oscillations around discontinuities. // Only cells in compression will have a non-zero value. -void viscosity_kernel(int x_min, int x_max, int y_min, int y_max, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, - clover::Buffer2D &density0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0) { +void viscosity_kernel( + bool use_target, + int x_min, int x_max, int y_min, int y_max, + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *celldx = field.celldx.data; + double *celldy = field.celldy.data; + double *density0 = field.density0.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double ugrad = (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1)) - (xvel0(i, j) + xvel0(i + 0, j + 1)); - double vgrad = (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1)) - (yvel0(i, j) + yvel0(i + 1, j + 0)); + double ugrad = (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + xvel0[(i + 1) + (j + 1) * vels_wk_stride]) - (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride]); + double vgrad = (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + yvel0[(i + 1) + (j + 1) * vels_wk_stride]) - (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * 
vels_wk_stride]); double div = (celldx[i] * (ugrad) + celldy[j] * (vgrad)); - double strain2 = 0.5 * (xvel0(i + 0, j + 1) + - xvel0(i + 1, j + 1) - - xvel0(i, j) - - xvel0(i + 1, j + 0)) / celldy[j] + - 0.5 * (yvel0(i + 1, j + 0) + - yvel0(i + 1, j + 1) - - yvel0(i, j) - - yvel0(i + 0, j + 1)) / celldx[i]; - double pgradx = (pressure(i + 1, j + 0) - pressure(i - 1, j + 0)) / (celldx[i] + celldx[i + 1]); - double pgrady = (pressure(i + 0, j + 1) - pressure(i + 0, j - 1)) / (celldy[j] + celldy[j + 2]); + double strain2 = 0.5 * (xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] - + xvel0[i + j * vels_wk_stride] - + xvel0[(i + 1) + (j + 0) * vels_wk_stride]) / celldy[j] + + 0.5 * (yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] - + yvel0[i + j * vels_wk_stride] - + yvel0[(i + 0) + (j + 1) * vels_wk_stride]) / celldx[i]; + double pgradx = (pressure[(i + 1) + (j + 0) * base_stride] - pressure[(i - 1) + (j + 0) * base_stride]) / (celldx[i] + celldx[i + 1]); + double pgrady = (pressure[(i + 0) + (j + 1) * base_stride] - pressure[(i + 0) + (j - 1) * base_stride]) / (celldy[j] + celldy[j + 2]); double pgradx2 = pgradx * pgradx; double pgrady2 = pgrady * pgrady; double limiter = ((0.5 * (ugrad) / celldx[i]) * pgradx2 + (0.5 * (vgrad) / celldy[j]) * pgrady2 + strain2 * pgradx * pgrady) / - std::fmax(pgradx2 + pgrady2, g_small); - if ((limiter > 0.0) || (div >= 0.0)) { viscosity(i, j) = 0.0; } + fmax(pgradx2 + pgrady2, g_small); + if ((limiter > 0.0) || (div >= 0.0)) { viscosity[i + j * base_stride] = 0.0; } else { double dirx = 1.0; if (pgradx < 0.0)dirx = -1.0; - pgradx = dirx * std::fmax(g_small, std::fabs(pgradx)); + pgradx = dirx * fmax(g_small, fabs(pgradx)); double diry = 1.0; if (pgradx < 0.0)diry = -1.0; - pgrady = diry * std::fmax(g_small, std::fabs(pgrady)); - double pgrad = std::sqrt(pgradx * pgradx + pgrady * pgrady); - double xgrad = std::fabs(celldx[i] * pgrad / pgradx); - double ygrad 
= std::fabs(celldy[j] * pgrad / pgrady); - double grad = std::fmin(xgrad, ygrad); + pgrady = diry * fmax(g_small, fabs(pgrady)); + double pgrad = sqrt(pgradx * pgradx + pgrady * pgrady); + double xgrad = fabs(celldx[i] * pgrad / pgradx); + double ygrad = fabs(celldy[j] * pgrad / pgrady); + double grad = fmin(xgrad, ygrad); double grad2 = grad * grad; - viscosity(i, j) = 2.0 * density0(i, j) * grad2 * limiter * limiter; + viscosity[i + j * base_stride] = 2.0 * density0[i + j * base_stride] * grad2 * limiter * limiter; } } } @@ -84,11 +92,23 @@ void viscosity_kernel(int x_min, int x_max, int y_min, int y_max, // viscosity. void viscosity(global_variables &globals) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; - viscosity_kernel(t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - t.field.celldx, t.field.celldy, t.field.density0, - t.field.pressure, t.field.viscosity, t.field.xvel0, - t.field.yvel0); + viscosity_kernel(globals.use_target, + t.info.t_xmin, + t.info.t_xmax, + t.info.t_ymin, + t.info.t_ymax, + t.field); } + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/visit.cpp b/src/visit.cpp index b7d532d..1786555 100644 --- a/src/visit.cpp +++ b/src/visit.cpp @@ -183,7 +183,7 @@ void visit(global_variables &globals, parallel_ ¶llel) { for (int j = globals.chunk.tiles[tile].info.t_xmin + 1; j <= globals.chunk.tiles[tile].info.t_xmax + 1; ++j) { double temp = (std::fabs(hm_viscosity(j, k)) > 0.00000001) ? hm_viscosity(j, k) - : 0.0; + : 0.0; u << std::scientific << std::setprecision(3) << temp << std::endl; } }