diff --git a/CMakeLists.txt b/CMakeLists.txt index 21a83a7..6d45c21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,60 +1,15 @@ -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(cloverleaf_sycl) +project(cloverleaf_openmp_target) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# -#set (OpenMP_CXX_FLAGS "-fiopenmp -qnextgen") -#set (OpenMP_CXX_LIB_NAMES "libiomp5") -#set (OpenMP_libiomp5_LIBRARY -# /opt/intel/oneapi/compiler/2021.1-beta08/linux/compiler/lib/intel64_lin/libiomp5.so -# ) find_package(OpenMP REQUIRED) set(CMAKE_VERBOSE_MAKEFILE YES) -#set(SYCL_RUNTIME DPCPP) -#set(HIPSYCL_INSTALL_DIR /home/tom/hypsycl_dist) - - -#if (SYCL_RUNTIME) -# -# list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) -# -# if (${SYCL_RUNTIME} STREQUAL "HIPSYCL") -# -# if (NOT HIPSYCL_INSTALL_DIR) -# message(FATAL_ERROR "HIPSYCL_INSTALL_DIR is undefined") -# endif () -# -# set(hipSYCL_DIR ${HIPSYCL_INSTALL_DIR}/lib/cmake) -# find_package(hipSYCL CONFIG REQUIRED) -# set(EXTRA_FLAGS -Wno-sign-compare -Wno-stringop-truncation) -# elseif (${SYCL_RUNTIME} STREQUAL "COMPUTECPP") -# -# if (NOT ComputeCpp_DIR) -# message(FATAL_ERROR "ComputeCpp_DIR is undefined") -# endif () -# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -# set(COMPUTECPP_USER_FLAGS -O3 -fsycl-split-modules=20 -mllvm -inline-threshold=10000 -no-serial-memop) -# find_package(ComputeCpp REQUIRED) -# # set(EXTRA_FLAGS -pedantic) -# elseif (${SYCL_RUNTIME} STREQUAL "DPCPP") -# -# set(CMAKE_CXX_STANDARD 17) -# set(CMAKE_CXX_COMPILER "dpcpp") -# set(EXTRA_FLAGS -pedantic) -# include_directories(/opt/intel/oneapi/compiler/2021.1-beta08/linux/compiler/include/) -# else () -# message(FATAL_ERROR "SYCL_RUNTIME unsupported, must be one of HIPSYCL|COMPUTECPP|DPCPP, got ${SYCL_RUNTIME}") -# endif () -#else () -# message(FATAL_ERROR "SYCL_RUNTIME not defined, must be one of HIPSYCL|COMPUTECPP|DPCPP") -#endif () - if 
(MPI_AS_LIBRARY) @@ -87,6 +42,7 @@ set(SOURCES src/advec_mom.cpp src/advection.cpp src/build_field.cpp + src/finalise_field.cpp src/calc_dt.cpp src/clover_leaf.cpp src/comms.cpp @@ -120,34 +76,51 @@ target_compile_options(clover_leaf -Wall -Wextra -Wcast-align - -Wfatal-errors +# -Wfatal-errors -Werror=return-type -Wno-unused-parameter -Wno-unused-variable - -Wno-ignored-attributes +# -Wno-ignored-attributes ${EXTRA_FLAGS} ) -set(OMP_OFFLOAD_FLAGS -foffload=nvptx-none) +separate_arguments(OMP_OFFLOAD_FLAGS) +separate_arguments(CXX_EXTRA_FLAGS) +separate_arguments(CXX_EXTRA_LINKER_FLAGS) + + +option(OMP_ALLOW_HOST "Whether the OMP clause `if(target: )` is included at compile time, some compilers may not support this feature" ON) -set(DEBUG_OPTIONS -O2 -fno-omit-frame-pointer -fsanitize=address ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) -set(RELEASE_OPTIONS -Ofast -march=native -mtune=native ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) +if (OMP_ALLOW_HOST) + add_definitions(-DOMP_ALLOW_HOST) +endif() + +set(DEBUG_OPTIONS -O2 -fno-omit-frame-pointer ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) +set(RELEASE_OPTIONS -O3 ${OMP_OFFLOAD_FLAGS} ${CXX_EXTRA_FLAGS}) target_link_libraries(clover_leaf PUBLIC ${MPI_C_LIB}) target_link_libraries(clover_leaf PUBLIC OpenMP::OpenMP_CXX OpenMP::OpenMP_C) -# remove when using omp target -#target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) -#target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) -target_link_libraries(clover_leaf PUBLIC $<$:-Wl,-lasan>) target_compile_options(clover_leaf PUBLIC "$<$:${RELEASE_OPTIONS}>") target_compile_options(clover_leaf PUBLIC "$<$:${RELEASE_OPTIONS}>") target_compile_options(clover_leaf PUBLIC "$<$:${DEBUG_OPTIONS}>") -target_link_options(clover_leaf PUBLIC ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) +if (${CMAKE_VERSION} VERSION_LESS "3.13.0") + message(WARNING "target_link_options is only available in CMake >= 3.13.0, using fallback target_link_libraries, this may cause issues with some compilers") 
+ + set(EXTRA_LINK_FLAGS ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) -#target_link_options(clover_leaf PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) + if (DEFINED CXX_EXTRA_LINKER_FLAGS) + list(APPEND EXTRA_LINK_FLAGS "-Wl,${CXX_EXTRA_LINKER_FLAGS}") + endif () + + target_link_libraries(clover_leaf PUBLIC ${EXTRA_LINK_FLAGS}) + +else () + target_link_options(clover_leaf PUBLIC ${OpenMP_CXX_FLAGS} ${OMP_OFFLOAD_FLAGS}) + target_link_options(clover_leaf PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) +endif () diff --git a/README.md b/README.md index 5e8a861..3810c30 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +> [!WARNING] +> Superseded by <https://github.com/UoB-HPC/CloverLeaf>, which contains an OpenMP target implementation, along with many other models. + # A OpenMP Target port of CloverLeaf This is a port of [CloverLeaf](https://github.com/UoB-HPC/cloverleaf_kokkos) from MPI+Kokkos to MPI+OpenMP Target. @@ -28,28 +31,41 @@ Flags: * Set `MPI_C_INCLUDE_DIR` to /include * Set `MPI_C_LIB` to the library name, for exampe: mpich for libmpich.so * `CXX_EXTRA_FLAGS` - `STRING`, appends extra flags that will be passed on to the compiler, applies to all configs - * `CXX_EXTRA_LINKER_FLAGS` - `STRING`, appends extra linker flags (the comma separated list after the `-Wl` flag) to the linker, applies to all configs - + * `CXX_EXTRA_LINKER_FLAGS` - `STRING`, appends extra linker flags (the comma separated list after the `-Wl` flag) to the linker, applies to all configs + * `OMP_OFFLOAD_FLAGS` - OpenMP 4.5 target offload flags that will be passed directly to the compiler and linker, see example flag combinations below. 
+ * GCC+NVIDIA - `"-foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math"` + * GCC+Radeon - `"-foffload=amdgcn-amdhsa='-march=gfx906' -foffload=-lm -fno-fast-math -fno-associative-math"` + * LLVM+NVIDIA - `"-fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_75"` + * ICC - `"-qnextgen -fiopenmp -fopenmp-targets=spir64"` + * CCE+NVIDIA - `"-fopenmp-targets=nvptx64 -Xopenmp-target -march=sm_60"` + * `OMP_ALLOW_HOST` - `BOOL(ON|OFF)`, enabled by default, set to false if the compiler is unable to support dynamic selection of host/target devices. If disabled, running the binary with `--no-target` emits an error. + + If parts of your toolchain are installed at different places, you'll have to specify it manually, for example: cmake3 -Bbuild -H. \ - -DCMAKE_C_COMPILER=/nfs/software/x86_64/gcc/9.1.0/bin/gcc \ - -DCMAKE_CXX_COMPILER=/nfs/software/x86_64/gcc/9.1.0/bin/g++ \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=Release \ + -DOMP_OFFLOAD_FLAGS="-foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math" Proceed with compiling: - cmake3 --build build --target clover_leaf --config Release -j $(nproc) - + cmake3 --build build --target clover_leaf --config Release -j $(nproc) + +## Known issues + + * ICC 2021.1 Beta 20200602 requires `-DOMP_ALLOW_HOST=OFF` + ## Running -The main `clover_leaf` executable takes a `clover.in` file as parameter and outputs `clover.out` at working directory. +The main `clover_leaf` executable takes a `clover.in` file as parameter and outputs `clover.out` in the working directory. For example, after successful compilation, at **project root**: - ./build/clover_leaf --file InputDecks/clover_bm16_short.in + ./build/clover_leaf --file InputDecks/clover_bm16_short.in See [Tested configurations](#tested-configurations) for tested platforms and drivers. 
@@ -58,7 +74,8 @@ For help, use the `-h` flag: Options: -h --help Print the message --list List available devices + --no-target Use OMP fallback --device Select device at INDEX from output of --list - --input Custom clover.in file (defaults to clover.in if unspecified) + --file Custom clover.in file (defaults to clover.in if unspecified) ``` diff --git a/src/PdV.cpp b/src/PdV.cpp index 94c7268..9ce30c1 100644 --- a/src/PdV.cpp +++ b/src/PdV.cpp @@ -26,7 +26,7 @@ #include "ideal_gas.h" #include "update_halo.h" #include "revert.h" -#include "utils.hpp" + // @brief Fortran PdV kernel. // @author Wayne Gaudin @@ -35,88 +35,118 @@ // level of the velocity data depends on whether it is invoked as the // predictor or corrector. void PdV_kernel( + bool use_target, bool predict, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel0, - clover::Buffer2D &yvel1, - clover::Buffer2D &volume_change) { + field_type &field +) { + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; // DO k=y_min,y_max // DO j=x_min,x_max if (predict) { - _Pragma("kernel2d") + double *xarea = field.xarea.data; + + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel0 = 
field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *volume_change = field.work_array1.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double left_flux = (xarea(i, j) * (xvel0(i, j) + - xvel0(i + 0, j + 1) + - xvel0(i, j) + - xvel0(i + 0, j + 1))) * 0.25 * dt * 0.5; - double right_flux = (xarea(i + 1, j + 0) * (xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1) + - xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1))) * 0.25 * dt * 0.5; - double bottom_flux = (yarea(i, j) * (yvel0(i, j) + - yvel0(i + 1, j + 0) + - yvel0(i, j) + - yvel0(i + 1, j + 0))) * 0.25 * dt * 0.5; - double top_flux = (yarea(i + 0, j + 1) * (yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1) + - yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1))) * 0.25 * dt * 0.5; + double left_flux = (xarea[i + j * flux_x_stride] * (xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; + double right_flux = (xarea[(i + 1) + (j + 0) * flux_x_stride] * (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] + + xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; + double bottom_flux = (yarea[i + j * flux_y_stride] * (yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride])) * 0.25 * dt * 0.5; + double top_flux = (yarea[(i + 0) + (j + 1) * flux_y_stride] * (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] + + yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt * 0.5; double total_flux = right_flux - left_flux + top_flux - bottom_flux; - double volume_change_s = volume(i, j) 
/ (volume(i, j) + total_flux); - double min_cell_volume = std::fmin(std::fmin(volume(i, j) + right_flux - left_flux + top_flux - bottom_flux, volume(i, j) + right_flux - left_flux), volume(i, j) + top_flux - bottom_flux); - double recip_volume = 1.0 / volume(i, j); - double energy_change = (pressure(i, j) / density0(i, j) + viscosity(i, j) / density0(i, j)) * total_flux * recip_volume; - energy1(i, j) = energy0(i, j) - energy_change; - density1(i, j) = density0(i, j) * volume_change_s; + double volume_change_s = volume[i + j * base_stride] / (volume[i + j * base_stride] + total_flux); + double min_cell_volume = fmin(fmin(volume[i + j * base_stride] + right_flux - left_flux + top_flux - bottom_flux, volume[i + j * base_stride] + right_flux - left_flux), + volume[i + j * base_stride] + top_flux - bottom_flux); + double recip_volume = 1.0 / volume[i + j * base_stride]; + double energy_change = + (pressure[i + j * base_stride] / density0[i + j * base_stride] + viscosity[i + j * base_stride] / density0[i + j * base_stride]) * total_flux * + recip_volume; + energy1[i + j * base_stride] = energy0[i + j * base_stride] - energy_change; + density1[i + j * base_stride] = density0[i + j * base_stride] * volume_change_s; } } } else { - _Pragma("kernel2d") + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *volume_change = field.work_array1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < 
(x_max + 2); i++) { - double left_flux = (xarea(i, j) * (xvel0(i, j) + - xvel0(i + 0, j + 1) + - xvel1(i, j) + - xvel1(i + 0, j + 1))) * 0.25 * dt; - double right_flux = (xarea(i + 1, j + 0) * (xvel0(i + 1, j + 0) + - xvel0(i + 1, j + 1) + - xvel1(i + 1, j + 0) + - xvel1(i + 1, j + 1))) * 0.25 * dt; - double bottom_flux = (yarea(i, j) * (yvel0(i, j) + - yvel0(i + 1, j + 0) + - yvel1(i, j) + - yvel1(i + 1, j + 0))) * 0.25 * dt; - double top_flux = (yarea(i + 0, j + 1) * (yvel0(i + 0, j + 1) + - yvel0(i + 1, j + 1) + - yvel1(i + 0, j + 1) + yvel1(i + 1, j + 1))) * 0.25 * dt; + double left_flux = (xarea[i + j * flux_x_stride] * (xvel0[i + j * vels_wk_stride] + + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel1[i + j * vels_wk_stride] + + xvel1[(i + 0) + (j + 1) * vels_wk_stride])) * 0.25 * dt; + double right_flux = (xarea[(i + 1) + (j + 0) * flux_x_stride] * (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] + + xvel1[(i + 1) + (j + 0) * vels_wk_stride] + + xvel1[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt; + double bottom_flux = (yarea[i + j * flux_y_stride] * (yvel0[i + j * vels_wk_stride] + + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel1[i + j * vels_wk_stride] + + yvel1[(i + 1) + (j + 0) * vels_wk_stride])) * 0.25 * dt; + double top_flux = (yarea[(i + 0) + (j + 1) * flux_y_stride] * (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] + + yvel1[(i + 0) + (j + 1) * vels_wk_stride] + yvel1[(i + 1) + (j + 1) * vels_wk_stride])) * 0.25 * dt; double total_flux = right_flux - left_flux + top_flux - bottom_flux; - double volume_change_s = volume(i, j) / (volume(i, j) + total_flux); - double min_cell_volume = std::fmin(std::fmin( - volume(i, j) + right_flux - left_flux + top_flux - bottom_flux, volume(i, j) + right_flux - left_flux), - volume(i, j) + top_flux - bottom_flux); - double recip_volume = 1.0 / volume(i, j); - double energy_change = (pressure(i, j) / density0(i, j) + 
viscosity(i, j) / density0(i, j)) * total_flux * recip_volume; - energy1(i, j) = energy0(i, j) - energy_change; - density1(i, j) = density0(i, j) * volume_change_s; + double volume_change_s = volume[i + j * base_stride] / (volume[i + j * base_stride] + total_flux); + double min_cell_volume = fmin(fmin( + volume[i + j * base_stride] + right_flux - left_flux + top_flux - bottom_flux, volume[i + j * base_stride] + right_flux - left_flux), + volume[i + j * base_stride] + top_flux - bottom_flux); + double recip_volume = 1.0 / volume[i + j * base_stride]; + double energy_change = + (pressure[i + j * base_stride] / density0[i + j * base_stride] + viscosity[i + j * base_stride] / density0[i + j * base_stride]) * total_flux * + recip_volume; + energy1[i + j * base_stride] = energy0[i + j * base_stride] - energy_change; + density1[i + j * base_stride] = density0[i + j * base_stride] * volume_change_s; } } } @@ -134,30 +164,25 @@ void PdV(global_variables &globals, bool predict) { globals.error_condition = 0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; - PdV_kernel(predict, + PdV_kernel(globals.use_target, + predict, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.volume, - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1, - t.field.pressure, - t.field.viscosity, - t.field.xvel0, - t.field.xvel1, - t.field.yvel0, - t.field.yvel1, - t.field.work_array1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif clover_check_error(globals.error_condition); if (globals.profiler_on) globals.profiler.PdV += timer() - kernel_time; diff --git a/src/accelerate.cpp b/src/accelerate.cpp index 7e04ab7..e8c93d4 100644 --- a/src/accelerate.cpp +++ b/src/accelerate.cpp @@ -21,7 +21,7 @@ #include "accelerate.h" #include "timer.h" -#include "utils.hpp" + // @brief 
Fortran acceleration kernel @@ -29,18 +29,10 @@ // @details The pressure and viscosity gradients are used to update the // velocity field. void accelerate_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1) { + field_type &field) { double halfdt = 0.5 * dt; @@ -51,29 +43,47 @@ void accelerate_kernel( //for(int j = ) - - _Pragma("kernel2d") + const int xarea_sizex = field.flux_x_stride; + const int yarea_sizex = field.flux_y_stride; + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel1 = field.yvel1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - double stepbymass_s = halfdt / ((density0(i - 1, j - 1) * volume(i - 1, j - 1) + - density0(i - 1, j + 0) * volume(i - 1, j + 0) + density0(i, j) * volume(i, j) + - density0(i + 0, j - 1) * volume(i + 0, j - 1)) * 0.25); - xvel1(i, j) = xvel0(i, j) - - stepbymass_s * (xarea(i, j) * (pressure(i, j) - pressure(i - 1, j + 0)) + - xarea(i + 0, j - 1) * (pressure(i + 0, j - 1) - pressure(i - 1, j - 1))); - yvel1(i, j) = yvel0(i, j) - - stepbymass_s * (yarea(i, j) * - (pressure(i, j) - pressure(i + 0, j - 1)) + - yarea(i - 1, j + 0) * (pressure(i - 1, j + 0) - 
pressure(i - 1, j - 1))); - xvel1(i, j) = xvel1(i, j) - - stepbymass_s * (xarea(i, j) * - (viscosity(i, j) - - viscosity(i - 1, j + 0)) + - xarea(i + 0, j - 1) * (viscosity(i + 0, j - 1) - viscosity(i - 1, j - 1))); - yvel1(i, j) = yvel1(i, j) - - stepbymass_s * (yarea(i, j) * - (viscosity(i, j) - viscosity(i + 0, j - 1)) + - yarea(i - 1, j + 0) * (viscosity(i - 1, j + 0) - viscosity(i - 1, j - 1))); + double stepbymass_s = halfdt / ((density0[(i - 1) + (j - 1) * base_stride] * volume[(i - 1) + (j - 1) * base_stride] + + density0[(i - 1) + (j + 0) * base_stride] * volume[(i - 1) + (j + 0) * base_stride] + + density0[i + j * base_stride] * volume[i + j * base_stride] + + density0[(i + 0) + (j - 1) * base_stride] * volume[(i + 0) + (j - 1) * base_stride]) * 0.25); + xvel1[i + j * vels_wk_stride] = xvel0[i + j * vels_wk_stride] - + stepbymass_s * (xarea[i + j * xarea_sizex] * (pressure[i + j * base_stride] - pressure[(i - 1) + (j + 0) * base_stride]) + + xarea[(i + 0) + (j - 1) * xarea_sizex] * (pressure[(i + 0) + (j - 1) * base_stride] - pressure[(i - 1) + (j - 1) * base_stride])); + yvel1[i + j * vels_wk_stride] = yvel0[i + j * vels_wk_stride] - + stepbymass_s * (yarea[i + j * yarea_sizex] * + (pressure[i + j * base_stride] - pressure[(i + 0) + (j - 1) * base_stride]) + + yarea[(i - 1) + (j + 0) * yarea_sizex] * (pressure[(i - 1) + (j + 0) * base_stride] - pressure[(i - 1) + (j - 1) * base_stride])); + xvel1[i + j * vels_wk_stride] = xvel1[i + j * vels_wk_stride] - + stepbymass_s * (xarea[i + j * xarea_sizex] * + (viscosity[i + j * base_stride] - + viscosity[(i - 1) + (j + 0) * base_stride]) + + xarea[(i + 0) + (j - 1) * xarea_sizex] * + (viscosity[(i + 0) + (j - 1) * base_stride] - viscosity[(i - 1) + (j - 1) * base_stride])); + yvel1[i + j * vels_wk_stride] = yvel1[i + j * vels_wk_stride] - + stepbymass_s * (yarea[i + j * yarea_sizex] * + (viscosity[i + j * base_stride] - viscosity[(i + 0) + (j - 1) * base_stride]) + + yarea[(i - 1) + (j + 0) * yarea_sizex] * + 
(viscosity[(i - 1) + (j + 0) * base_stride] - viscosity[(i - 1) + (j - 1) * base_stride])); } } } @@ -87,30 +97,29 @@ void accelerate(global_variables &globals) { double kernel_time = 0; if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; accelerate_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.volume, - t.field.density0, - t.field.pressure, - t.field.viscosity, - t.field.xvel0, - t.field.yvel0, - t.field.xvel1, - t.field.yvel1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.acceleration += timer() - kernel_time; } diff --git a/src/advec_cell.cpp b/src/advec_cell.cpp index f3aae30..759c665 100644 --- a/src/advec_cell.cpp +++ b/src/advec_cell.cpp @@ -20,7 +20,9 @@ #include #include "advec_cell.h" -#include "utils.hpp" + + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) // @brief Fortran cell advection kernel. @@ -28,31 +30,22 @@ // @details Performs a second order advective remap using van-Leer limiting // with directional splitting. 
void advec_cell_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, int dir, int sweep_number, - clover::Buffer1D &vertexdx, - clover::Buffer1D &vertexdy, - clover::Buffer2D &volume, - clover::Buffer2D &density1, - clover::Buffer2D &energy1, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &mass_flux_y, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &pre_vol, - clover::Buffer2D &post_vol, - clover::Buffer2D &pre_mass, - clover::Buffer2D &post_mass, - clover::Buffer2D &advec_vol, - clover::Buffer2D &post_ener, - clover::Buffer2D &ener_flux) { + field_type &field) { const double one_by_six = 1.0 / 6.0; + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + if (dir == g_xdir) { // DO k=y_min-2,y_max+2 @@ -61,11 +54,20 @@ void advec_cell_kernel( if (sweep_number == 1) { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + (vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j)); - post_vol(i, j) = pre_vol(i, j) - (vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j)); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + + (vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride] + + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - + vol_flux_y[i + j * flux_y_stride]); + post_vol[i + j * vels_wk_stride] = pre_vol[i + j * vels_wk_stride] - (vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] 
- vol_flux_x[i + j * flux_x_stride]); } } @@ -73,11 +75,16 @@ void advec_cell_kernel( } else { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); - post_vol(i, j) = volume(i, j); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; } } @@ -85,58 +92,66 @@ void advec_cell_kernel( // DO k=y_min,y_max // DO j=x_min,x_max+2 - _Pragma("kernel2d") + double *vertexdx = field.vertexdx.data; + double *density1 = field.density1.data; + double *energy1 = field.energy1.data; + double *mass_flux_x = field.mass_flux_x.data; + double *vol_flux_x = field.vol_flux_x.data; + double *pre_vol = field.work_array1.data; + double *ener_flux = field.work_array7.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2 + 2); i++) ({ int upwind, donor, downwind, dif; double sigmat, sigma3, sigma4, sigmav, sigma, sigmam, diffuw, diffdw, limiter, wind; - if (vol_flux_x(i, j) > 0.0) { + if (vol_flux_x[i + j * flux_x_stride] > 0.0) { upwind = i - 2; donor = i - 1; downwind = i; dif = donor; } else { - upwind = std::min(i + 1, x_max + 2); + upwind = MIN(i + 1, x_max + 2); donor = i; downwind = i - 1; dif = upwind; } - sigmat = std::fabs(vol_flux_x(i, j)) / pre_vol(donor, j); + sigmat = fabs(vol_flux_x[i + j * flux_x_stride]) / pre_vol[donor + j * vels_wk_stride]; sigma3 
= (1.0 + sigmat) * (vertexdx[i] / vertexdx[dif]); sigma4 = 2.0 - sigmat; - sigma = sigmat; +// sigma = sigmat; sigmav = sigmat; - diffuw = density1(donor, j) - density1(upwind, j); - diffdw = density1(downwind, j) - density1(donor, j); + diffuw = density1[donor + j * base_stride] - density1[upwind + j * base_stride]; + diffdw = density1[downwind + j * base_stride] - density1[donor + j * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmav) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - mass_flux_x(i, j) = vol_flux_x(i, j) * (density1(donor, j) + limiter); - sigmam = std::fabs(mass_flux_x(i, j)) / (density1(donor, j) * pre_vol(donor, j)); - diffuw = energy1(donor, j) - energy1(upwind, j); - diffdw = energy1(downwind, j) - energy1(donor, j); + mass_flux_x[i + j * flux_x_stride] = vol_flux_x[i + j * flux_x_stride] * (density1[donor + j * base_stride] + limiter); + sigmam = fabs(mass_flux_x[i + j * flux_x_stride]) / (density1[donor + j * base_stride] * pre_vol[donor + j * vels_wk_stride]); + diffuw = energy1[donor + j * base_stride] - energy1[upwind + j * base_stride]; + diffdw = energy1[downwind + j * base_stride] - energy1[donor + j * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmam) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - ener_flux(i, j) = mass_flux_x(i, j) * (energy1(donor, j) + limiter); + ener_flux[i + j * vels_wk_stride] = mass_flux_x[i + j * flux_x_stride] * 
(energy1[donor + j * base_stride] + limiter); }); } @@ -145,15 +160,17 @@ void advec_cell_kernel( // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double pre_mass_s = density1(i, j) * pre_vol(i, j); - double post_mass_s = pre_mass_s + mass_flux_x(i, j) - mass_flux_x(i + 1, j + 0); - double post_ener_s = (energy1(i, j) * pre_mass_s + ener_flux(i, j) - ener_flux(i + 1, j + 0)) / post_mass_s; - double advec_vol_s = pre_vol(i, j) + vol_flux_x(i, j) - vol_flux_x(i + 1, j + 0); - density1(i, j) = post_mass_s / advec_vol_s; - energy1(i, j) = post_ener_s; + double pre_mass_s = density1[i + j * base_stride] * pre_vol[i + j * vels_wk_stride]; + double post_mass_s = pre_mass_s + mass_flux_x[i + j * flux_x_stride] - mass_flux_x[(i + 1) + (j + 0) * flux_x_stride]; + double post_ener_s = (energy1[i + j * base_stride] * pre_mass_s + ener_flux[i + j * vels_wk_stride] - ener_flux[(i + 1) + (j + 0) * vels_wk_stride]) / post_mass_s; + double advec_vol_s = pre_vol[i + j * vels_wk_stride] + vol_flux_x[i + j * flux_x_stride] - vol_flux_x[(i + 1) + (j + 0) * flux_x_stride]; + density1[i + j * base_stride] = post_mass_s / advec_vol_s; + energy1[i + j * base_stride] = post_ener_s; } } @@ -165,11 +182,20 @@ void advec_cell_kernel( if (sweep_number == 1) { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + (vol_flux_y(i + 0, j + 1) - vol_flux_y(i, 
j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j)); - post_vol(i, j) = pre_vol(i, j) - (vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j)); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + + (vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride] + + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - + vol_flux_x[i + j * flux_x_stride]); + post_vol[i + j * vels_wk_stride] = pre_vol[i + j * vels_wk_stride] - (vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]); } } @@ -177,11 +203,16 @@ void advec_cell_kernel( } else { - _Pragma("kernel2d") + double *volume = field.volume.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *post_vol = field.work_array2.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - pre_vol(i, j) = volume(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); - post_vol(i, j) = volume(i, j); + pre_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; } } @@ -191,72 +222,81 @@ void advec_cell_kernel( // DO k=y_min,y_max+2 // DO j=x_min,x_max - _Pragma("kernel2d") + double *vertexdy = field.vertexdy.data; + double *density1 = field.density1.data; + double *energy1 = field.energy1.data; + double *mass_flux_y = field.mass_flux_y.data; + double *vol_flux_y = field.vol_flux_y.data; + double *pre_vol = field.work_array1.data; + double *ener_flux = field.work_array7.data; + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) ({ int upwind, donor, downwind, dif; double sigmat, 
sigma3, sigma4, sigmav, sigma, sigmam, diffuw, diffdw, limiter, wind; - if (vol_flux_y(i, j) > 0.0) { + if (vol_flux_y[i + j * flux_y_stride] > 0.0) { upwind = j - 2; donor = j - 1; downwind = j; dif = donor; } else { - upwind = std::min(j + 1, y_max + 2); + upwind = MIN(j + 1, y_max + 2); donor = j; downwind = j - 1; dif = upwind; } - sigmat = std::fabs(vol_flux_y(i, j)) / pre_vol(i, donor); + sigmat = fabs(vol_flux_y[i + j * flux_y_stride]) / pre_vol[i + donor * vels_wk_stride]; sigma3 = (1.0 + sigmat) * (vertexdy[j] / vertexdy[dif]); sigma4 = 2.0 - sigmat; - sigma = sigmat; +// sigma = sigmat; sigmav = sigmat; - diffuw = density1(i, donor) - density1(i, upwind); - diffdw = density1(i, downwind) - density1(i, donor); + diffuw = density1[i + donor * base_stride] - density1[i + upwind * base_stride]; + diffdw = density1[i + downwind * base_stride] - density1[i + donor * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmav) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - mass_flux_y(i, j) = vol_flux_y(i, j) * (density1(i, donor) + limiter); - sigmam = std::fabs(mass_flux_y(i, j)) / (density1(i, donor) * pre_vol(i, donor)); - diffuw = energy1(i, donor) - energy1(i, upwind); - diffdw = energy1(i, downwind) - energy1(i, donor); + mass_flux_y[i + j * flux_y_stride] = vol_flux_y[i + j * flux_y_stride] * (density1[i + donor * base_stride] + limiter); + sigmam = fabs(mass_flux_y[i + j * flux_y_stride]) / (density1[i + donor * base_stride] * pre_vol[i + donor * vels_wk_stride]); + diffuw = energy1[i + donor * base_stride] - energy1[i + upwind * base_stride]; + diffdw = energy1[i + downwind * base_stride] - energy1[i + donor * base_stride]; wind = 1.0; if (diffdw <= 0.0)wind = -1.0; 
if (diffuw * diffdw > 0.0) { limiter = (1.0 - sigmam) * wind * - std::fmin(std::fmin( - std::fabs(diffuw), - std::fabs(diffdw)), - one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); + fmin(fmin( + fabs(diffuw), + fabs(diffdw)), + one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); } else { limiter = 0.0; } - ener_flux(i, j) = mass_flux_y(i, j) * (energy1(i, donor) + limiter); + ener_flux[i + j * vels_wk_stride] = mass_flux_y[i + j * flux_y_stride] * (energy1[i + donor * base_stride] + limiter); }); } // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double pre_mass_s = density1(i, j) * pre_vol(i, j); - double post_mass_s = pre_mass_s + mass_flux_y(i, j) - mass_flux_y(i + 0, j + 1); - double post_ener_s = (energy1(i, j) * pre_mass_s + ener_flux(i, j) - ener_flux(i + 0, j + 1)) / post_mass_s; - double advec_vol_s = pre_vol(i, j) + vol_flux_y(i, j) - vol_flux_y(i + 0, j + 1); - density1(i, j) = post_mass_s / advec_vol_s; - energy1(i, j) = post_ener_s; + double pre_mass_s = density1[i + j * base_stride] * pre_vol[i + j * vels_wk_stride]; + double post_mass_s = pre_mass_s + mass_flux_y[i + j * flux_y_stride] - mass_flux_y[(i + 0) + (j + 1) * flux_y_stride]; + double post_ener_s = (energy1[i + j * base_stride] * pre_mass_s + ener_flux[i + j * vels_wk_stride] - ener_flux[(i + 0) + (j + 1) * vels_wk_stride]) / post_mass_s; + double advec_vol_s = pre_vol[i + j * vels_wk_stride] + vol_flux_y[i + j * flux_y_stride] - vol_flux_y[(i + 0) + (j + 1) * flux_y_stride]; + density1[i + j * base_stride] = post_mass_s / advec_vol_s; + energy1[i + j * base_stride] = post_ener_s; } } @@ -270,30 +310,24 @@ void advec_cell_kernel( // @details Invokes the user selected advection kernel. 
void advec_cell_driver(global_variables &globals, int tile, int sweep_number, int direction) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + tile_type &t = globals.chunk.tiles[tile]; advec_cell_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, direction, sweep_number, - t.field.vertexdx, - t.field.vertexdy, - t.field.volume, - t.field.density1, - t.field.energy1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.work_array7); + t.field); + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } diff --git a/src/advec_mom.cpp b/src/advec_mom.cpp index 48c226d..eb543ea 100644 --- a/src/advec_mom.cpp +++ b/src/advec_mom.cpp @@ -20,7 +20,7 @@ #include #include "advec_mom.h" -#include "utils.hpp" + // @brief Fortran momentum advection kernel // @author Wayne Gaudin @@ -29,22 +29,10 @@ // Note that although pre_vol is only set and not used in the update, please // leave it in the method. 
void advec_mom_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &vel1, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &mass_flux_y, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &volume, - clover::Buffer2D &density1, - clover::Buffer2D &node_flux, - clover::Buffer2D &node_mass_post, - clover::Buffer2D &node_mass_pre, - clover::Buffer2D &mom_flux, - clover::Buffer2D &pre_vol, - clover::Buffer2D &post_vol, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, + clover::Buffer2D &vel1_buffer, + field_type &field, int which_vel, int sweep_number, int direction) { @@ -55,44 +43,76 @@ void advec_mom_kernel( // DO k=y_min-2,y_max+2 // DO j=x_min-2,x_max+2 + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + if (mom_sweep == 1) { // x 1 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; + double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; } } } else if (mom_sweep == 2) { // y 1 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; 
+ double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; } } } else if (mom_sweep == 3) { // x 2 - _Pragma("kernel2d") + double *vol_flux_y = field.vol_flux_y.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_y(i + 0, j + 1) - vol_flux_y(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_y[(i + 0) + (j + 1) * flux_y_stride] - vol_flux_y[i + j * flux_y_stride]; } } } else if (mom_sweep == 4) { // y 2 - _Pragma("kernel2d") + double *vol_flux_x = field.vol_flux_x.data; + double *volume = field.volume.data; + double *pre_vol = field.work_array5.data; + double *post_vol = field.work_array6.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 
2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - post_vol(i, j) = volume(i, j); - pre_vol(i, j) = post_vol(i, j) + vol_flux_x(i + 1, j + 0) - vol_flux_x(i, j); + post_vol[i + j * vels_wk_stride] = volume[i + j * base_stride]; + pre_vol[i + j * vels_wk_stride] = post_vol[i + j * vels_wk_stride] + vol_flux_x[(i + 1) + (j + 0) * flux_x_stride] - vol_flux_x[i + j * flux_x_stride]; } } } @@ -103,46 +123,62 @@ void advec_mom_kernel( // DO j=x_min-2,x_max+2 + double *mass_flux_x = field.mass_flux_x.data; + double *node_flux = field.work_array1.data; - _Pragma("kernel2d") + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 2 + 1); i < (x_max + 2 + 2); i++) { - node_flux(i, j) = 0.25 * (mass_flux_x(i + 0, j - 1) + mass_flux_x(i, j) + - mass_flux_x(i + 1, j - 1) + mass_flux_x(i + 1, j + 0)); + node_flux[i + j * vels_wk_stride] = 0.25 * (mass_flux_x[(i + 0) + (j - 1) * flux_x_stride] + mass_flux_x[i + j * flux_x_stride] + + mass_flux_x[(i + 1) + (j - 1) * flux_x_stride] + mass_flux_x[(i + 1) + (j + 0) * flux_x_stride]); } } // DO k=y_min,y_max+1 // DO j=x_min-1,x_max+2 + double *density1 = field.density1.data; + double *node_mass_post = field.work_array2.data; + double *node_mass_pre = field.work_array3.data; + double *post_vol = field.work_array6.data; + - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 1 + 1); i < (x_max + 2 + 2); i++) { - node_mass_post(i, j) = 0.25 * (density1(i + 0, j - 1) * - post_vol(i + 0, j - 1) + - density1(i, j) * - post_vol(i, j) + - density1(i - 1, j - 1) * - post_vol(i - 1, j - 1) + - density1(i - 1, j + 0) * post_vol(i - 1, j + 0)); - node_mass_pre(i, j) = node_mass_post(i, j) - node_flux(i - 1, j + 0) + node_flux(i, j); + 
node_mass_post[i + j * vels_wk_stride] = 0.25 * (density1[(i + 0) + (j - 1) * base_stride] * + post_vol[(i + 0) + (j - 1) * vels_wk_stride] + + density1[i + j * base_stride] * + post_vol[i + j * vels_wk_stride] + + density1[(i - 1) + (j - 1) * base_stride] * + post_vol[(i - 1) + (j - 1) * vels_wk_stride] + + density1[(i - 1) + (j + 0) * base_stride] * post_vol[(i - 1) + (j + 0) * vels_wk_stride]); + node_mass_pre[i + j * vels_wk_stride] = + node_mass_post[i + j * vels_wk_stride] - node_flux[(i - 1) + (j + 0) * vels_wk_stride] + node_flux[i + j * vels_wk_stride]; } } } - // DO k=y_min,y_max+1 - // DO j=x_min-1,x_max+1 + // DO k=y_min,y_max+1 + // DO j=x_min-1,x_max+1 + + const int vel1_sizex = vel1_buffer.nX(); + double *vel1 = vel1_buffer.data; + double *node_flux = field.work_array1.data; + double *node_mass_pre = field.work_array3.data; + double *mom_flux = field.work_array4.data; + double *celldx = field.celldx.data; - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min - 1 + 1); i < (x_max + 1 + 2); i++) ({ int upwind, donor, downwind, dif; double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; - if (node_flux(i, j) < 0.0) { + if (node_flux[i + j * vels_wk_stride] < 0.0) { upwind = i + 2; donor = i + 1; downwind = i; @@ -153,21 +189,21 @@ void advec_mom_kernel( downwind = i + 1; dif = upwind; } - sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(donor, j)); + sigma = fabs(node_flux[i + j * vels_wk_stride]) / (node_mass_pre[donor + j * vels_wk_stride]); width = celldx[i]; - vdiffuw = vel1(donor, j) - vel1(upwind, j); - vdiffdw = vel1(downwind, j) - vel1(donor, j); + vdiffuw = vel1[donor + j * vel1_sizex] - vel1[upwind + j * vel1_sizex]; + vdiffdw = vel1[downwind + j * vel1_sizex] - vel1[donor + j * vel1_sizex]; limiter = 0.0; if (vdiffuw * vdiffdw > 0.0) { - auw = std::fabs(vdiffuw); - adw = 
std::fabs(vdiffdw); + auw = fabs(vdiffuw); + adw = fabs(vdiffdw); wind = 1.0; if (vdiffdw <= 0.0)wind = -1.0; - limiter = wind * std::fmin(std::fmin( + limiter = wind * fmin(fmin( width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldx[dif]) / 6.0, auw), adw); } - advec_vel_s = vel1(donor, j) + (1.0 - sigma) * limiter; - mom_flux(i, j) = advec_vel_s * node_flux(i, j); + advec_vel_s = vel1[donor + j * vel1_sizex] + (1.0 - sigma) * limiter; + mom_flux[i + j * vels_wk_stride] = advec_vel_s * node_flux[i + j * vels_wk_stride]; }); } @@ -176,10 +212,15 @@ void advec_mom_kernel( - _Pragma("kernel2d") + double *node_mass_post = field.work_array2.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vel1(i, j) = (vel1(i, j) * node_mass_pre(i, j) + mom_flux(i - 1, j + 0) - mom_flux(i, j)) / node_mass_post(i, j); + vel1[i + j * vel1_sizex] = + (vel1[i + j * vel1_sizex] * node_mass_pre[i + j * vels_wk_stride] + mom_flux[(i - 1) + (j + 0) * vels_wk_stride] - mom_flux[i + j * vels_wk_stride]) / + node_mass_post[i + j * vels_wk_stride]; } } } else if (direction == 2) { @@ -188,45 +229,60 @@ void advec_mom_kernel( // DO j=x_min,x_max+1 + double *node_flux = field.work_array1.data; + double *mass_flux_y = field.mass_flux_y.data; - _Pragma("kernel2d") + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 2 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - node_flux(i, j) = 0.25 * (mass_flux_y(i - 1, j + 0) + mass_flux_y(i, j) + - mass_flux_y(i - 1, j + 1) + mass_flux_y(i + 0, j + 1)); + node_flux[i + j * vels_wk_stride] = 0.25 * (mass_flux_y[(i - 1) + (j + 0) * flux_y_stride] + mass_flux_y[i + j * flux_y_stride] + + mass_flux_y[(i - 1) + (j + 1) * flux_y_stride] + mass_flux_y[(i + 0) + (j + 1) * 
flux_y_stride]); } } // DO k=y_min-1,y_max+2 // DO j=x_min,x_max+1 + double *density1 = field.density1.data; + double *node_mass_post = field.work_array2.data; + double *node_mass_pre = field.work_array3.data; + double *post_vol = field.work_array6.data; - _Pragma("kernel2d") + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 1 + 1); j < (y_max + 2 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - node_mass_post(i, j) = 0.25 * (density1(i + 0, j - 1) * - post_vol(i + 0, j - 1) + - density1(i, j) * - post_vol(i, j) + - density1(i - 1, j - 1) * - post_vol(i - 1, j - 1) + - density1(i - 1, j + 0) * - post_vol(i - 1, j + 0)); - node_mass_pre(i, j) = node_mass_post(i, j) - node_flux(i + 0, j - 1) + node_flux(i, j); + node_mass_post[i + j * vels_wk_stride] = 0.25 * (density1[(i + 0) + (j - 1) * base_stride] * + post_vol[(i + 0) + (j - 1) * vels_wk_stride] + + density1[i + j * base_stride] * + post_vol[i + j * vels_wk_stride] + + density1[(i - 1) + (j - 1) * base_stride] * + post_vol[(i - 1) + (j - 1) * vels_wk_stride] + + density1[(i - 1) + (j + 0) * base_stride] * + post_vol[(i - 1) + (j + 0) * vels_wk_stride]); + node_mass_pre[i + j * vels_wk_stride] = + node_mass_post[i + j * vels_wk_stride] - node_flux[(i + 0) + (j - 1) * vels_wk_stride] + node_flux[i + j * vels_wk_stride]; } } } - // DO k=y_min-1,y_max+1 - // DO j=x_min,x_max+1 + // DO k=y_min-1,y_max+1 + // DO j=x_min,x_max+1 - _Pragma("kernel2d") + const int vel1_sizex = vel1_buffer.nX(); + double *vel1 = vel1_buffer.data; + double *node_flux = field.work_array1.data; + double *node_mass_pre = field.work_array3.data; + double *mom_flux = field.work_array4.data; + double *celldy = field.celldy.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min - 1 + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) ({ int upwind, donor, 
downwind, dif; double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; - if (node_flux(i, j) < 0.0) { + if (node_flux[i + j * vels_wk_stride] < 0.0) { upwind = j + 2; donor = j + 1; downwind = j; @@ -237,21 +293,21 @@ void advec_mom_kernel( downwind = j + 1; dif = upwind; } - sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); + sigma = fabs(node_flux[i + j * vels_wk_stride]) / (node_mass_pre[i + donor * vels_wk_stride]); width = celldy[j]; - vdiffuw = vel1(i, donor) - vel1(i, upwind); - vdiffdw = vel1(i, downwind) - vel1(i, donor); + vdiffuw = vel1[i + donor * vel1_sizex] - vel1[i + upwind * vel1_sizex]; + vdiffdw = vel1[i + downwind * vel1_sizex] - vel1[i + donor * vel1_sizex]; limiter = 0.0; if (vdiffuw * vdiffdw > 0.0) { - auw = std::fabs(vdiffuw); - adw = std::fabs(vdiffdw); + auw = fabs(vdiffuw); + adw = fabs(vdiffdw); wind = 1.0; if (vdiffdw <= 0.0)wind = -1.0; - limiter = wind * std::fmin(std::fmin( + limiter = wind * fmin(fmin( width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); } - advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; - mom_flux(i, j) = advec_vel_s * node_flux(i, j); + advec_vel_s = vel1[i + donor * vel1_sizex] + (1.0 - sigma) * limiter; + mom_flux[i + j * vels_wk_stride] = advec_vel_s * node_flux[i + j * vels_wk_stride]; }); } @@ -261,10 +317,15 @@ void advec_mom_kernel( - _Pragma("kernel2d") + double *node_mass_post = field.work_array2.data; + + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vel1(i, j) = (vel1(i, j) * node_mass_pre(i, j) + mom_flux(i + 0, j - 1) - mom_flux(i, j)) / node_mass_post(i, j); + vel1[i + j * vel1_sizex] = + (vel1[i + j * vel1_sizex] * node_mass_pre[i + j * vels_wk_stride] + mom_flux[(i + 0) + (j - 1) * vels_wk_stride] - mom_flux[i + j * vels_wk_stride]) / + node_mass_post[i + j 
* vels_wk_stride]; } } } @@ -278,57 +339,42 @@ void advec_mom_driver(global_variables &globals, int tile, int which_vel, int di int sweep_number) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + + tile_type &t = globals.chunk.tiles[tile]; if (which_vel == 1) { advec_mom_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, t.field.xvel1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.volume, - t.field.density1, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.celldx, - t.field.celldy, + t.field, which_vel, sweep_number, direction); } else { advec_mom_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, t.field.yvel1, - t.field.mass_flux_x, - t.field.vol_flux_x, - t.field.mass_flux_y, - t.field.vol_flux_y, - t.field.volume, - t.field.density1, - t.field.work_array1, - t.field.work_array2, - t.field.work_array3, - t.field.work_array4, - t.field.work_array5, - t.field.work_array6, - t.field.celldx, - t.field.celldy, + t.field, which_vel, sweep_number, direction); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/build_field.cpp b/src/build_field.cpp index f9c9842..06bb7f5 100644 --- a/src/build_field.cpp +++ b/src/build_field.cpp @@ -25,14 +25,87 @@ #include "build_field.h" -#include "utils.hpp" -// Allocate Kokkos Views for the data arrays + +// Allocate device buffers for the data arrays void build_field(global_variables &globals) { for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; + field_type &field = t.field; + + + + double * density0 = field.density0.data; + double * density1 = field.density1.data; + double * energy0 = field.energy0.data; + double * energy1 = field.energy1.data; + double * pressure = field.pressure.data; + double * viscosity = 
field.viscosity.data; + double * soundspeed = field.soundspeed.data; + double * yvel0 = field.yvel0.data; + double * yvel1 = field.yvel1.data; + double * xvel0 = field.xvel0.data; + double * xvel1 = field.xvel1.data; + double * vol_flux_x = field.vol_flux_x.data; + double * vol_flux_y = field.vol_flux_y.data; + double * mass_flux_x = field.mass_flux_x.data; + double * mass_flux_y = field.mass_flux_y.data; + double * work_array1 = field.work_array1.data; + double * work_array2 = field.work_array2.data; + double * work_array3 = field.work_array3.data; + double * work_array4 = field.work_array4.data; + double * work_array5 = field.work_array5.data; + double * work_array6 = field.work_array6.data; + double * work_array7 = field.work_array7.data; + double * cellx = field.cellx.data; + double * celldx = field.celldx.data; + double * celly = field.celly.data; + double * celldy = field.celldy.data; + double * vertexx = field.vertexx.data; + double * vertexdx = field.vertexdx.data; + double * vertexy = field.vertexy.data; + double * vertexdy = field.vertexdy.data; + double * volume = field.volume.data; + double * xarea = field.xarea.data; + double * yarea = field.yarea.data; + + + #pragma omp target enter data \ + map(alloc: density0[:field.density0.N()]) \ + map(alloc: density1[:field.density1.N()]) \ + map(alloc: energy0[:field.energy0.N()]) \ + map(alloc: energy1[:field.energy1.N()]) \ + map(alloc: pressure[:field.pressure.N()]) \ + map(alloc: viscosity[:field.viscosity.N()]) \ + map(alloc: soundspeed[:field.soundspeed.N()]) \ + map(alloc: yvel0[:field.yvel0.N()]) \ + map(alloc: yvel1[:field.yvel1.N()]) \ + map(alloc: xvel0[:field.xvel0.N()]) \ + map(alloc: xvel1[:field.xvel1.N()]) \ + map(alloc: vol_flux_x[:field.vol_flux_x.N()]) \ + map(alloc: vol_flux_y[:field.vol_flux_y.N()]) \ + map(alloc: mass_flux_x[:field.mass_flux_x.N()]) \ + map(alloc: mass_flux_y[:field.mass_flux_y.N()]) \ + map(alloc: work_array1[:field.work_array1.N()]) \ + map(alloc: 
work_array2[:field.work_array2.N()]) \ + map(alloc: work_array3[:field.work_array3.N()]) \ + map(alloc: work_array4[:field.work_array4.N()]) \ + map(alloc: work_array5[:field.work_array5.N()]) \ + map(alloc: work_array6[:field.work_array6.N()]) \ + map(alloc: work_array7[:field.work_array7.N()]) \ + map(alloc: cellx[:field.cellx.N()]) \ + map(alloc: celldx[:field.celldx.N()]) \ + map(alloc: celly[:field.celly.N()]) \ + map(alloc: celldy[:field.celldy.N()]) \ + map(alloc: vertexx[:field.vertexx.N()]) \ + map(alloc: vertexdx[:field.vertexdx.N()]) \ + map(alloc: vertexy[:field.vertexy.N()]) \ + map(alloc: vertexdy[:field.vertexdy.N()]) \ + map(alloc: volume[:field.volume.N()]) \ + map(alloc: xarea[:field.xarea.N()]) \ + map(alloc: yarea[:field.yarea.N()]) \ const int xrange = (t.info.t_xmax + 2) - (t.info.t_xmin - 2) + 1; const int yrange = (t.info.t_ymax + 2) - (t.info.t_ymin - 2) + 1; @@ -94,90 +167,99 @@ void build_field(global_variables &globals) { // cycle which can skew timings in the first step // Take a reference to the lowest structure, as Kokkos device cannot necessarily chase through the structure. 
- field_type &field = t.field; // Kokkos::MDRangePolicy > loop_bounds_1({0, 0}, {xrange + 1, yrange + 1}); // Nested loop over (t_ymin-2:t_ymax+3) and (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange + 1); j++) { - for (int i = (0); i < (xrange + 1); i++) { - field.work_array1(i, j) = 0.0; - field.work_array2(i, j) = 0.0; - field.work_array3(i, j) = 0.0; - field.work_array4(i, j) = 0.0; - field.work_array5(i, j) = 0.0; - field.work_array6(i, j) = 0.0; - field.work_array7(i, j) = 0.0; - field.xvel0(i, j) = 0.0; - field.xvel1(i, j) = 0.0; - field.yvel0(i, j) = 0.0; - field.yvel1(i, j) = 0.0; + + + const int vels_wk_stride = field.vels_wk_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange + 1); j++) { + for (int i = 0; i < (xrange + 1); i++) { + work_array1[i + j * vels_wk_stride] = 0.0; + work_array2[i + j * vels_wk_stride] = 0.0; + work_array3[i + j * vels_wk_stride] = 0.0; + work_array4[i + j * vels_wk_stride] = 0.0; + work_array5[i + j * vels_wk_stride] = 0.0; + work_array6[i + j * vels_wk_stride] = 0.0; + work_array7[i + j * vels_wk_stride] = 0.0; + xvel0[i + j * vels_wk_stride] = 0.0; + xvel1[i + j * vels_wk_stride] = 0.0; + yvel0[i + j * vels_wk_stride] = 0.0; + yvel1[i + j * vels_wk_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+2) and (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.density0(i, j) = 0.0; - field.density1(i, j) = 0.0; - field.energy0(i, j) = 0.0; - field.energy1(i, j) = 0.0; - field.pressure(i, j) = 0.0; - field.viscosity(i, j) = 0.0; - field.soundspeed(i, j) = 0.0; - field.volume(i, j) = 0.0; + const int base_stride = field.base_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); 
i++) { + density0[i + j * base_stride] = 0.0; + density1[i + j * base_stride] = 0.0; + energy0[i + j * base_stride] = 0.0; + energy1[i + j * base_stride] = 0.0; + pressure[i + j * base_stride] = 0.0; + viscosity[i + j * base_stride] = 0.0; + soundspeed[i + j * base_stride] = 0.0; + volume[i + j * base_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+2) and (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.vol_flux_x(i, j) = 0.0; - field.mass_flux_x(i, j) = 0.0; - field.xarea(i, j) = 0.0; + const int flux_x_stride = field.flux_x_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { + vol_flux_x[i + j * flux_x_stride] = 0.0; + mass_flux_x[i + j * flux_x_stride] = 0.0; + xarea[i + j * flux_x_stride] = 0.0; } } // Nested loop over (t_ymin-2:t_ymax+3) and (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel2d") - for (int j = (0); j < (yrange + 1); j++) { - for (int i = (0); i < (xrange); i++) { - field.vol_flux_y(i, j) = 0.0; - field.mass_flux_y(i, j) = 0.0; - field.yarea(i, j) = 0.0; + const int flux_y_stride = field.flux_y_stride; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange + 1); j++) { + for (int i = 0; i < (xrange); i++) { + vol_flux_y[i + j * flux_y_stride] = 0.0; + mass_flux_y[i + j * flux_y_stride] = 0.0; + yarea[i + j * flux_y_stride] = 0.0; } } // (t_xmin-2:t_xmax+2) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (xrange); id++) { - field.cellx[id] = 0.0; - field.celldx[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (xrange); id++) { + cellx[id] = 0.0; + celldx[id] = 0.0; } // (t_ymin-2:t_ymax+2) inclusive - _Pragma("kernel1d") - for (int id = 
(0); id < (yrange); id++) { - field.celly[id] = 0.0; - field.celldy[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (yrange); id++) { + celly[id] = 0.0; + celldy[id] = 0.0; } // (t_xmin-2:t_xmax+3) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (xrange + 1); id++) { - field.vertexx[id] = 0.0; - field.vertexdx[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (xrange + 1); id++) { + vertexx[id] = 0.0; + vertexdx[id] = 0.0; } // (t_ymin-2:t_ymax+3) inclusive - _Pragma("kernel1d") - for (int id = (0); id < (yrange + 1); id++) { - field.vertexy[id] = 0.0; - field.vertexdy[id] = 0.0; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int id = 0; id < (yrange + 1); id++) { + vertexy[id] = 0.0; + vertexdy[id] = 0.0; } diff --git a/src/calc_dt.cpp b/src/calc_dt.cpp index 528e249..18694d0 100644 --- a/src/calc_dt.cpp +++ b/src/calc_dt.cpp @@ -21,7 +21,7 @@ #include #include "calc_dt.h" -#include "utils.hpp" + #include // @brief Fortran timestep kernel @@ -32,26 +32,14 @@ void calc_dt_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dtmin, double dtc_safe, double dtu_safe, double dtv_safe, double dtdiv_safe, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer1D &cellx, - clover::Buffer1D &celly, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, - clover::Buffer2D &volume, - clover::Buffer2D &density0, - clover::Buffer2D &energy0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity_a, - clover::Buffer2D &soundspeed, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, + field_type &field, double &dt_min_val, int &dtl_control, double &xl_pos, @@ -70,35 +58,56 @@ void calc_dt_kernel( // Kokkos::MDRangePolicy > policy({x_min + 1, y_min + 1}, {x_max + 2, y_max + 2}); - _Pragma("kernel2d") + const int 
flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *celldx = field.celldx.data; + double *celldy = field.celldy.data; + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + + // XXX See https://forums.developer.nvidia.com/t/nvc-f-0000-internal-compiler-error-unhandled-size-for-preparing-max-constant/221740 + double dt_min_val0 = dt_min_val; + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) map(tofrom:dt_min_val) reduction(min:dt_min_val0) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { double dsx = celldx[i]; double dsy = celldy[j]; - double cc = soundspeed(i, j) * soundspeed(i, j); - cc = cc + 2.0 * viscosity_a(i, j) / density0(i, j); - cc = std::fmax(std::sqrt(cc), g_small); - double dtct = dtc_safe * std::fmin(dsx, dsy) / cc; + double cc = soundspeed[i + j * base_stride] * soundspeed[i + j * base_stride]; + cc = cc + 2.0 * viscosity[i + j * base_stride] / density0[i + j * base_stride]; + cc = fmax(sqrt(cc), g_small); + double dtct = dtc_safe * fmin(dsx, dsy) / cc; double div = 0.0; - double dv1 = (xvel0(i, j) + xvel0(i + 0, j + 1)) * xarea(i, j); - double dv2 = (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1)) * xarea(i + 1, j + 0); + double dv1 = (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride]) * xarea[i + j * flux_x_stride]; + double dv2 = (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + xvel0[(i + 1) + (j + 1) * vels_wk_stride]) * xarea[(i + 1) + (j + 0) * flux_x_stride]; div = div + dv2 - dv1; - double dtut = dtu_safe * 2.0 * volume(i, j) / 
std::fmax(std::fmax(std::fabs(dv1), std::fabs(dv2)), g_small * volume(i, j)); - dv1 = (yvel0(i, j) + yvel0(i + 1, j + 0)) * yarea(i, j); - dv2 = (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1)) * yarea(i + 0, j + 1); + double dtut = dtu_safe * 2.0 * volume[i + j * base_stride] / fmax(fmax(fabs(dv1), fabs(dv2)), g_small * volume[i + j * base_stride]); + dv1 = (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * vels_wk_stride]) * yarea[i + j * flux_y_stride]; + dv2 = (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + yvel0[(i + 1) + (j + 1) * vels_wk_stride]) * yarea[(i + 0) + (j + 1) * flux_y_stride]; div = div + dv2 - dv1; - double dtvt = dtv_safe * 2.0 * volume(i, j) / std::fmax(std::fmax(std::fabs(dv1), std::fabs(dv2)), g_small * volume(i, j)); - div = div / (2.0 * volume(i, j)); + double dtvt = dtv_safe * 2.0 * volume[i + j * base_stride] / fmax(fmax(fabs(dv1), fabs(dv2)), g_small * volume[i + j * base_stride]); + div = div / (2.0 * volume[i + j * base_stride]); double dtdivt; if (div < -g_small) { dtdivt = dtdiv_safe * (-1.0 / div); } else { dtdivt = g_big; } - double mins = std::fmin(dtct, std::fmin(dtut, std::fmin(dtvt, std::fmin(dtdivt, g_big)))); - dt_min_val = std::fmin(mins, dt_min_val); + double mins = fmin(dtct, fmin(dtut, fmin(dtvt, fmin(dtdivt, g_big)))); + dt_min_val0 = fmin(mins, dt_min_val0); } } + dt_min_val = dt_min_val0; dtl_control = static_cast(10.01 * (jk_control - static_cast(jk_control))); @@ -111,14 +120,14 @@ void calc_dt_kernel( if (small != 0) { - auto cellx_acc = cellx; - auto celly_acc = celly; - auto density0_acc = density0; - auto energy0_acc = energy0; - auto pressure_acc = pressure; - auto soundspeed_acc = soundspeed; - auto xvel0_acc = xvel0; - auto yvel0_acc = yvel0; + auto &cellx_acc = field.cellx; + auto &celly_acc = field.celly; + auto &density0_acc = field.density0; + auto &energy0_acc = field.energy0; + auto &pressure_acc = field.pressure; + auto &soundspeed_acc = field.soundspeed; + auto &xvel0_acc = field.xvel0; + auto 
&yvel0_acc = field.yvel0; std::cout << "Timestep information:" << std::endl @@ -153,9 +162,13 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string int l_control; int small = 0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif tile_type &t = globals.chunk.tiles[tile]; calc_dt_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -165,20 +178,7 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string globals.config.dtu_safe, globals.config.dtv_safe, globals.config.dtdiv_safe, - t.field.xarea, - t.field.yarea, - t.field.cellx, - t.field.celly, - t.field.celldx, - t.field.celldy, - t.field.volume, - t.field.density0, - t.field.energy0, - t.field.pressure, - t.field.viscosity, - t.field.soundspeed, - t.field.xvel0, - t.field.yvel0, + t.field, local_dt, l_control, xl_pos, @@ -188,6 +188,10 @@ void calc_dt(global_variables &globals, int tile, double &local_dt, std::string small ); + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (l_control == 1) local_control = "sound"; if (l_control == 2) local_control = "xvel"; diff --git a/src/clover_leaf.cpp b/src/clover_leaf.cpp index b28c84e..c4e5a2d 100644 --- a/src/clover_leaf.cpp +++ b/src/clover_leaf.cpp @@ -47,6 +47,7 @@ #include "hydro.h" #include "initialise.h" #include "version.h" +#include "finalise_field.h" #include #include @@ -54,7 +55,6 @@ std::ostream g_out(nullptr); int main(int argc, char *argv[]) { - // Initialise MPI first MPI_Init(&argc, &argv); @@ -74,11 +74,13 @@ int main(int argc, char *argv[]) { } - std::unique_ptr config = initialise(parallel, - std::vector(argv + 1, argv + argc)); + auto config = initialise(parallel, std::vector(argv + 1, argv + argc)); std::cout << "Launching hydro" << std::endl; - hydro(*config, parallel); + hydro(config, parallel); + + // calls the appropriate omp target exit data for all buffers, see build_field.cpp for the enter data half + finalise_field(config); // Finilise programming 
models // Kokkos::finalize(); diff --git a/src/comms.cpp b/src/comms.cpp index 8831ad6..5691eb5 100644 --- a/src/comms.cpp +++ b/src/comms.cpp @@ -34,7 +34,7 @@ #include "comms.h" #include "pack_kernel.h" -#include "utils.hpp" + #include @@ -61,6 +61,8 @@ void clover_barrier() { } void clover_barrier(global_variables &globals) { + #pragma omp flush +// globals.deviceToHost();; clover_barrier(); } @@ -498,8 +500,13 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -512,6 +519,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_density1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -524,6 +532,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_energy0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -536,6 +545,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_energy1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -548,6 +558,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_pressure] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -560,6 +571,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_viscosity] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -572,6 +584,7 @@ void 
clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_soundspeed] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -584,6 +597,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_xvel0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -596,6 +610,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_xvel1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -608,6 +623,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_yvel0] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -620,6 +636,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_yvel1] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -632,6 +649,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_vol_flux_x] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -644,6 +662,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_vol_flux_y] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -656,6 +675,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_mass_flux_x] == 1) { clover_pack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -668,6 +688,7 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ } if (fields[field_mass_flux_y] == 1) { clover_pack_message_left( + 
globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -678,26 +699,37 @@ void clover_pack_left(global_variables &globals, int tile, const int fields[NUM_ depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_left( global_variables &globals, - clover::Buffer1D &left_snd, - clover::Buffer1D &left_rcv, + clover::Buffer1D &left_snd_buffer, + clover::Buffer1D &left_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { // First copy send buffer from device to host // Kokkos::deep_copy(globals.chunk.hm_left_snd, left_snd); + int left_task = globals.chunk.chunk_neighbours[chunk_left] - 1; - MPI_Isend(globals.chunk.left_snd.actual(), total_size, MPI_DOUBLE, left_task, tag_send, + + double *left_snd = left_snd_buffer.data; + double *left_rcv = left_rcv_buffer.data; + #pragma omp target update from(left_snd[:left_snd_buffer.N()]) + + MPI_Isend(left_snd, total_size, MPI_DOUBLE, left_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.left_rcv.actual(), total_size, MPI_DOUBLE, left_task, tag_recv, + MPI_Irecv(left_rcv, total_size, MPI_DOUBLE, left_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(left_rcv[:left_rcv_buffer.N()]) + + } void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -706,8 +738,13 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -720,6 +757,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_density1] == 1) { 
clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -732,6 +770,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -744,6 +783,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -756,6 +796,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_pressure] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -768,6 +809,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_viscosity] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -780,6 +822,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_soundspeed] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -792,6 +835,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -804,6 +848,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -816,6 +861,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel0] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -828,6 +874,7 @@ void 
clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel1] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -840,6 +887,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -852,6 +900,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -864,6 +913,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -876,6 +926,7 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_left( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -886,7 +937,9 @@ void clover_unpack_left(global_variables &globals, const int fields[NUM_FIELDS], depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_pack_right(global_variables &globals, int tile, const int fields[NUM_FIELDS], int depth, @@ -895,8 +948,13 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -909,6 +967,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if 
(fields[field_density1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -921,6 +980,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_energy0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -933,6 +993,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_energy1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -945,6 +1006,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_pressure] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -957,6 +1019,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_viscosity] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -969,6 +1032,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_soundspeed] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -981,6 +1045,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_xvel0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -993,6 +1058,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_xvel1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1005,6 +1071,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_yvel0] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1017,6 +1084,7 
@@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_yvel1] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1029,6 +1097,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_vol_flux_x] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1041,6 +1110,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_vol_flux_y] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1053,6 +1123,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_mass_flux_x] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1065,6 +1136,7 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM } if (fields[field_mass_flux_y] == 1) { clover_pack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1075,13 +1147,15 @@ void clover_pack_right(global_variables &globals, int tile, const int fields[NUM depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_right( global_variables &globals, - clover::Buffer1D &right_snd, - clover::Buffer1D &right_rcv, + clover::Buffer1D &right_snd_buffer, + clover::Buffer1D &right_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1090,11 +1164,16 @@ void clover_send_recv_message_right( int right_task = globals.chunk.chunk_neighbours[chunk_right] - 1; - MPI_Isend(globals.chunk.right_snd.actual(), total_size, MPI_DOUBLE, right_task, + double *right_snd = right_snd_buffer.data; + double *right_rcv = right_rcv_buffer.data; + #pragma omp 
target update from(right_snd[:right_snd_buffer.N()]) + + MPI_Isend(right_snd, total_size, MPI_DOUBLE, right_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.right_rcv.actual(), total_size, MPI_DOUBLE, right_task, + MPI_Irecv(right_rcv, total_size, MPI_DOUBLE, right_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(right_rcv[:right_rcv_buffer.N()]) } void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1103,8 +1182,13 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_bottom - globals.chunk.bottom) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1117,6 +1201,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_density1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1129,6 +1214,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_energy0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1141,6 +1227,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_energy1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1153,6 +1240,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_pressure] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1165,6 +1253,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_viscosity] == 1) { 
clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1177,6 +1266,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_soundspeed] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1189,6 +1279,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_xvel0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1201,6 +1292,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_xvel1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1213,6 +1305,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_yvel0] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1225,6 +1318,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_yvel1] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1237,6 +1331,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1249,6 +1344,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1261,6 +1357,7 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1273,6 +1370,7 
@@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_right( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1283,7 +1381,9 @@ void clover_unpack_right(global_variables &globals, const int fields[NUM_FIELDS] depth, y_face_data, left_right_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_FIELDS], int depth, @@ -1292,8 +1392,13 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1306,6 +1411,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_density1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1318,6 +1424,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_energy0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1330,6 +1437,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_energy1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1342,6 +1450,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_pressure] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1354,6 +1463,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if 
(fields[field_viscosity] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1366,6 +1476,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_soundspeed] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1378,6 +1489,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_xvel0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1390,6 +1502,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_xvel1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1402,6 +1515,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_yvel0] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1414,6 +1528,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_yvel1] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1426,6 +1541,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_vol_flux_x] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1438,6 +1554,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_vol_flux_y] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1450,6 +1567,7 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_mass_flux_x] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1462,6 +1580,7 @@ 
void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F } if (fields[field_mass_flux_y] == 1) { clover_pack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1472,13 +1591,15 @@ void clover_pack_top(global_variables &globals, int tile, const int fields[NUM_F depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_top( global_variables &globals, - clover::Buffer1D &top_snd, - clover::Buffer1D &top_rcv, + clover::Buffer1D &top_snd_buffer, + clover::Buffer1D &top_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1487,11 +1608,16 @@ void clover_send_recv_message_top( int top_task = globals.chunk.chunk_neighbours[chunk_top] - 1; - MPI_Isend(globals.chunk.top_snd.actual(), total_size, MPI_DOUBLE, top_task, tag_send, + double *top_snd = top_snd_buffer.data; + double *top_rcv = top_rcv_buffer.data; + #pragma omp target update from(top_snd[:top_snd_buffer.N()]) + + MPI_Isend(top_snd, total_size, MPI_DOUBLE, top_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.top_rcv.actual(), total_size, MPI_DOUBLE, top_task, tag_recv, + MPI_Irecv(top_rcv, total_size, MPI_DOUBLE, top_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(top_rcv[:top_rcv_buffer.N()]) } void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1500,8 +1626,13 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1514,6 +1645,7 @@ void clover_unpack_top(global_variables &globals, const int 
fields[NUM_FIELDS], } if (fields[field_density1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1526,6 +1658,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1538,6 +1671,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_energy1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1550,6 +1684,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_pressure] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1562,6 +1697,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_viscosity] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1574,6 +1710,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_soundspeed] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1586,6 +1723,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1598,6 +1736,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_xvel1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1610,6 +1749,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel0] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, 
t.info.t_ymin, @@ -1622,6 +1762,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_yvel1] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1634,6 +1775,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1646,6 +1788,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1658,6 +1801,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1670,6 +1814,7 @@ void clover_unpack_top(global_variables &globals, const int fields[NUM_FIELDS], } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_top( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1690,8 +1835,13 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1704,6 +1854,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_density1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1716,6 +1867,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_energy0] == 1) { clover_pack_message_bottom( + 
globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1728,6 +1880,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_energy1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1740,6 +1893,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_pressure] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1752,6 +1906,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_viscosity] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1764,6 +1919,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_soundspeed] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1776,6 +1932,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_xvel0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1788,6 +1945,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_xvel1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1800,6 +1958,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_yvel0] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1812,6 +1971,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_yvel1] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1824,6 +1984,7 @@ void clover_pack_bottom(global_variables 
&globals, int tile, const int fields[NU } if (fields[field_vol_flux_x] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1836,6 +1997,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_vol_flux_y] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1848,6 +2010,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_mass_flux_x] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1860,6 +2023,7 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU } if (fields[field_mass_flux_y] == 1) { clover_pack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1870,13 +2034,15 @@ void clover_pack_bottom(global_variables &globals, int tile, const int fields[NU depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } void clover_send_recv_message_bottom( global_variables &globals, - clover::Buffer1D &bottom_snd, - clover::Buffer1D &bottom_rcv, + clover::Buffer1D &bottom_snd_buffer, + clover::Buffer1D &bottom_rcv_buffer, int total_size, int tag_send, int tag_recv, MPI_Request &req_send, MPI_Request &req_recv) { @@ -1885,11 +2051,16 @@ void clover_send_recv_message_bottom( int bottom_task = globals.chunk.chunk_neighbours[chunk_bottom] - 1; - MPI_Isend(globals.chunk.bottom_snd.actual(), total_size, MPI_DOUBLE, bottom_task, + double *bottom_snd = bottom_snd_buffer.data; + double *bottom_rcv = bottom_rcv_buffer.data; + #pragma omp target update from(bottom_snd[:bottom_snd_buffer.N()]) + + MPI_Isend(bottom_snd, total_size, MPI_DOUBLE, bottom_task, tag_send, MPI_COMM_WORLD, &req_send); - MPI_Irecv(globals.chunk.bottom_rcv.actual(), total_size, MPI_DOUBLE, bottom_task, + 
MPI_Irecv(bottom_rcv, total_size, MPI_DOUBLE, bottom_task, tag_recv, MPI_COMM_WORLD, &req_recv); + #pragma omp target update to(bottom_rcv[:bottom_rcv_buffer.N()]) } void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS], int tile, int depth, @@ -1899,8 +2070,13 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS tile_type &t = globals.chunk.tiles[tile]; int t_offset = (t.info.t_left - globals.chunk.left) * depth; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + if (fields[field_density0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1913,6 +2089,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_density1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1925,6 +2102,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_energy0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1937,6 +2115,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_energy1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1949,6 +2128,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_pressure] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1961,6 +2141,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_viscosity] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1973,6 +2154,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_soundspeed] == 1) { 
clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1985,6 +2167,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_xvel0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -1997,6 +2180,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_xvel1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2009,6 +2193,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_yvel0] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2021,6 +2206,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_yvel1] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2033,6 +2219,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_vol_flux_x] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2045,6 +2232,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_vol_flux_y] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2057,6 +2245,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_mass_flux_x] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -2069,6 +2258,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS } if (fields[field_mass_flux_y] == 1) { clover_unpack_message_bottom( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ 
-2079,5 +2269,7 @@ void clover_unpack_bottom(global_variables &globals, const int fields[NUM_FIELDS depth, y_face_data, bottom_top_offset[field_mass_flux_y] + t_offset); } - + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif } diff --git a/src/comms.h b/src/comms.h index ba4f2b6..6b55fe1 100644 --- a/src/comms.h +++ b/src/comms.h @@ -22,8 +22,6 @@ #define COMMS_H #include "definitions.h" -#include "utils.hpp" - #include // Structure to hold MPI rank information diff --git a/src/definitions.h b/src/definitions.h index 528db49..498803a 100644 --- a/src/definitions.h +++ b/src/definitions.h @@ -20,7 +20,12 @@ #ifndef GRID_H #define GRID_H +// Enables dumping buffers at each iteration as text files, see hydro.cpp for actual implementation #define DEBUG false +// Enables buffer synchronisation between host and device before and after each kernel invocation. +// This is useful for debugging individual kernels; +// by synchronising buffer data, not all kernels have to be executed on the device or host +#define SYNC_BUFFERS 0 #include @@ -29,7 +34,8 @@ #include #include #include -#include "utils.hpp" +#include +#include #define g_ibig 640000 @@ -37,8 +43,111 @@ #define g_big (1.0e+21) #define NUM_FIELDS 15 +#ifdef OMP_ALLOW_HOST +#define clover_use_target(cond) if(target: (cond)) +#else +#define clover_use_target(cond) /*no-op*/ +#endif + +namespace clover { + + template + static T cpp14_exchange(T &obj, U &&new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; + } + + template + struct Buffer1D { + + private: + const size_t size; + + public: + T *data; + + explicit Buffer1D(size_t size) : size(size), data(new T[size]) { + assert(size > 0); + } + + Buffer1D(const Buffer1D &that) : size(that.size), data(new T[size]) { + std::copy(that.data, that.data + size, data); + } + + Buffer1D(Buffer1D &&other) noexcept: size(other.size), data(cpp14_exchange(other.data, nullptr)) {} + + Buffer1D &operator=(Buffer1D &&other) noexcept { + 
size = other.size; + std::swap(data, other.data); + return *this; + } + + [[nodiscard]] T &operator[](size_t i) { return data[i]; } + [[nodiscard]] T operator[](size_t i) const { return data[i]; } + + [[nodiscard]] constexpr size_t N() const { return size; } + + + Buffer1D &operator=(const Buffer1D &other) { + if (this != &other) { + delete[] data; + std::copy(other.data, other.data + size, data); + size = other.size; + } + return *this; + } + + ~Buffer1D() { delete[] data; } + }; + + + template + struct Buffer2D { + private: + const size_t sizeX, sizeY; + public: + + T *data; + + Buffer2D(size_t sizeX, size_t sizeY) : sizeX(sizeX), sizeY(sizeY), data(new T[sizeX * sizeY]) { + assert(sizeX > 0); + assert(sizeY > 0); + } + Buffer2D(const Buffer2D &that) : sizeX(that.sizeX), sizeY(that.sizeY), data(new T[sizeX * sizeY]) { + std::copy(that.data, that.data + (sizeX * sizeY), data); + } + + Buffer2D(Buffer2D &&other) noexcept: sizeX(other.sizeX), sizeY(other.sizeY), data(cpp14_exchange(other.data, nullptr)) {} + + + [[nodiscard]] const T &operator()(size_t i, size_t j) { return data[i + j * sizeX]; } + [[nodiscard]] T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } + [[nodiscard]] constexpr size_t N() const { return sizeX * sizeY; } + [[nodiscard]] constexpr size_t nX() const { return sizeX; } + [[nodiscard]] constexpr size_t nY() const { return sizeY; } + + + Buffer2D &operator=(const Buffer2D &other) { + if (this != &other) { + return *this = Buffer2D(other); + } + } + + Buffer2D &operator=(Buffer2D &&other) noexcept { + sizeX = other.sizeX; + sizeY = other.sizeY; + std::swap(data, other.data); + return *this; + } + ~Buffer2D() { delete[] data; } + + }; + +} + typedef std::chrono::time_point timepoint; @@ -63,10 +172,10 @@ static inline void record(const std::string &name, const std::function &buffer) { - out << name << "(" << 1 << ") [" << buffer.size() << "]" << std::endl; +show(std::ostream &out, const std::string &name, const clover::Buffer1D 
&buffer) { + out << name << "(" << 1 << ") [" << buffer.N() << "]" << std::endl; out << "\t"; - for (size_t i = 0; i < buffer.size(); ++i) { + for (size_t i = 0; i < buffer.N(); ++i) { out << buffer[i] << ", "; } out << std::endl; @@ -74,11 +183,11 @@ show(std::ostream &out, const std::string &name, clover::Buffer1D &buffe // formats and then dumps content of 2d double buffer to stream static inline void show(std::ostream &out, const std::string &name, clover::Buffer2D &buffer) { - out << name << "(" << 2 << ") [" << buffer.sizeX << "x" << buffer.sizeY << "]" + out << name << "(" << 2 << ") [" << buffer.nX() << "x" << buffer.nY() << "]" << std::endl; - for (size_t i = 0; i < buffer.sizeX; ++i) { + for (size_t i = 0; i < buffer.nX(); ++i) { out << "\t"; - for (size_t j = 0; j < buffer.sizeY; ++j) { + for (size_t j = 0; j < buffer.nY(); ++j) { out << buffer(i, j) << ", "; } out << std::endl; @@ -180,17 +289,24 @@ struct profiler_type { struct field_type { - clover::Buffer2D density0; - clover::Buffer2D density1; - clover::Buffer2D energy0; - clover::Buffer2D energy1; + clover::Buffer2D density0, density1; + clover::Buffer2D energy0, energy1; clover::Buffer2D pressure; clover::Buffer2D viscosity; + clover::Buffer2D volume; clover::Buffer2D soundspeed; + + + int density0_stride, density1_stride; + int energy0_stride, energy1_stride; + int pressure_stride; + int viscosity_stride; + int volume_stride; + int soundspeed_stride; + + clover::Buffer2D xvel0, xvel1; clover::Buffer2D yvel0, yvel1; - clover::Buffer2D vol_flux_x, mass_flux_x; - clover::Buffer2D vol_flux_y, mass_flux_y; clover::Buffer2D work_array1; // node_flux, stepbymass, volume_change, pre_vol clover::Buffer2D work_array2; // node_mass_post, post_vol @@ -200,36 +316,33 @@ struct field_type { clover::Buffer2D work_array6; // pre_vol, post_ener clover::Buffer2D work_array7; // post_vol, ener_flux - clover::Buffer1D cellx; - clover::Buffer1D celldx; - clover::Buffer1D celly; - clover::Buffer1D celldy; - 
clover::Buffer1D vertexx; - clover::Buffer1D vertexdx; - clover::Buffer1D vertexy; - clover::Buffer1D vertexdy; + clover::Buffer2D vol_flux_x, mass_flux_x; + clover::Buffer2D vol_flux_y, mass_flux_y; + clover::Buffer2D xarea, yarea; + + clover::Buffer1D cellx, celldx; + clover::Buffer1D celly, celldy; + + clover::Buffer1D vertexx, vertexdx; + clover::Buffer1D vertexy, vertexdy; + + + int base_stride; + int vels_wk_stride; + int flux_x_stride, flux_y_stride; - clover::Buffer2D volume; - clover::Buffer2D xarea; - clover::Buffer2D yarea; + explicit field_type(const int xrange, const int yrange) : - explicit field_type(const size_t xrange, const size_t yrange) : - density0(xrange, yrange), - density1(xrange, yrange), - energy0(xrange, yrange), - energy1(xrange, yrange), + density0(xrange, yrange), density1(xrange, yrange), + energy0(xrange, yrange), energy1(xrange, yrange), pressure(xrange, yrange), viscosity(xrange, yrange), + volume(xrange, yrange), soundspeed(xrange, yrange), - xvel0(xrange + 1, yrange + 1), - xvel1(xrange + 1, yrange + 1), - yvel0(xrange + 1, yrange + 1), - yvel1(xrange + 1, yrange + 1), - vol_flux_x(xrange + 1, yrange), - mass_flux_x(xrange + 1, yrange), - vol_flux_y(xrange, yrange + 1), - mass_flux_y(xrange, yrange + 1), + + xvel0(xrange + 1, yrange + 1), xvel1(xrange + 1, yrange + 1), + yvel0(xrange + 1, yrange + 1), yvel1(xrange + 1, yrange + 1), work_array1(xrange + 1, yrange + 1), work_array2(xrange + 1, yrange + 1), work_array3(xrange + 1, yrange + 1), @@ -237,19 +350,22 @@ struct field_type { work_array5(xrange + 1, yrange + 1), work_array6(xrange + 1, yrange + 1), work_array7(xrange + 1, yrange + 1), - cellx(xrange), - celldx(xrange), - celly(yrange), - celldy(yrange), + + vol_flux_x(xrange + 1, yrange), mass_flux_x(xrange + 1, yrange), + vol_flux_y(xrange, yrange + 1), mass_flux_y(xrange, yrange + 1), + xarea(xrange + 1, yrange), yarea(xrange, yrange + 1), + + cellx(xrange), celldx(xrange), + celly(yrange), celldy(yrange), + 
vertexx(xrange + 1), vertexdx(xrange + 1), vertexy(yrange + 1), vertexdy(yrange + 1), - volume(xrange, yrange), - xarea(xrange + 1, yrange), - yarea(xrange, yrange + 1) {} - + base_stride(xrange), + vels_wk_stride(xrange + 1), + flux_x_stride(xrange + 1), flux_y_stride(xrange) {} }; @@ -368,6 +484,8 @@ struct global_variables { const global_config config; const size_t omp_device; + bool use_target; + chunk_type chunk; int error_condition; @@ -392,15 +510,175 @@ struct global_variables { explicit global_variables( const global_config &config, size_t omp_device, + bool use_target, chunk_type chunk) : - config(config), omp_device(omp_device), chunk(std::move(chunk)), + config(config), omp_device(omp_device), use_target(use_target), chunk(std::move(chunk)), dt(config.dtinit), dtold(config.dtinit), profiler_on(config.profiler_on) {} + void hostToDevice() { + + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { + tile_type &t = chunk.tiles[tile]; + field_type &field = t.field; + + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; 
+ double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target update \ + to(density0[:field.density0.N()]) \ + to(density1[:field.density1.N()]) \ + to(energy0[:field.energy0.N()]) \ + to(energy1[:field.energy1.N()]) \ + to(pressure[:field.pressure.N()]) \ + to(viscosity[:field.viscosity.N()]) \ + to(soundspeed[:field.soundspeed.N()]) \ + to(yvel0[:field.yvel0.N()]) \ + to(yvel1[:field.yvel1.N()]) \ + to(xvel0[:field.xvel0.N()]) \ + to(xvel1[:field.xvel1.N()]) \ + to(vol_flux_x[:field.vol_flux_x.N()]) \ + to(vol_flux_y[:field.vol_flux_y.N()]) \ + to(mass_flux_x[:field.mass_flux_x.N()]) \ + to(mass_flux_y[:field.mass_flux_y.N()]) \ + to(work_array1[:field.work_array1.N()]) \ + to(work_array2[:field.work_array2.N()]) \ + to(work_array3[:field.work_array3.N()]) \ + to(work_array4[:field.work_array4.N()]) \ + to(work_array5[:field.work_array5.N()]) \ + to(work_array6[:field.work_array6.N()]) \ + to(work_array7[:field.work_array7.N()]) \ + to(cellx[:field.cellx.N()]) \ + to(celldx[:field.celldx.N()]) \ + to(celly[:field.celly.N()]) \ + to(celldy[:field.celldy.N()]) \ + to(vertexx[:field.vertexx.N()]) \ + to(vertexdx[:field.vertexdx.N()]) \ + to(vertexy[:field.vertexy.N()]) \ + to(vertexdy[:field.vertexdy.N()]) \ + to(volume[:field.volume.N()]) \ + to(xarea[:field.xarea.N()]) \ + to(yarea[:field.yarea.N()]) + } + + } + + void deviceToHost() { + + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { + tile_type &t = chunk.tiles[tile]; + field_type &field = t.field; + + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = 
field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target update \ + from(density0[:field.density0.N()]) \ + from(density1[:field.density1.N()]) \ + from(energy0[:field.energy0.N()]) \ + from(energy1[:field.energy1.N()]) \ + from(pressure[:field.pressure.N()]) \ + from(viscosity[:field.viscosity.N()]) \ + from(soundspeed[:field.soundspeed.N()]) \ + from(yvel0[:field.yvel0.N()]) \ + from(yvel1[:field.yvel1.N()]) \ + from(xvel0[:field.xvel0.N()]) \ + from(xvel1[:field.xvel1.N()]) \ + from(vol_flux_x[:field.vol_flux_x.N()]) \ + from(vol_flux_y[:field.vol_flux_y.N()]) \ + from(mass_flux_x[:field.mass_flux_x.N()]) \ + from(mass_flux_y[:field.mass_flux_y.N()]) \ + from(work_array1[:field.work_array1.N()]) \ + from(work_array2[:field.work_array2.N()]) \ + 
from(work_array3[:field.work_array3.N()]) \ + from(work_array4[:field.work_array4.N()]) \ + from(work_array5[:field.work_array5.N()]) \ + from(work_array6[:field.work_array6.N()]) \ + from(work_array7[:field.work_array7.N()]) \ + from(cellx[:field.cellx.N()]) \ + from(celldx[:field.celldx.N()]) \ + from(celly[:field.celly.N()]) \ + from(celldy[:field.celldy.N()]) \ + from(vertexx[:field.vertexx.N()]) \ + from(vertexdx[:field.vertexdx.N()]) \ + from(vertexy[:field.vertexy.N()]) \ + from(vertexdy[:field.vertexdy.N()]) \ + from(volume[:field.volume.N()]) \ + from(xarea[:field.xarea.N()]) \ + from(yarea[:field.yarea.N()]) + } + } + // dumps all content to file; for debugging only void dump(const std::string &filename) { + deviceToHost(); + std::cout << "Dumping globals to " << filename << std::endl; record(filename, [&](std::ostream &out) { @@ -500,6 +778,7 @@ struct global_variables { } + }; diff --git a/src/field_summary.cpp b/src/field_summary.cpp index 0e9c016..3214804 100644 --- a/src/field_summary.cpp +++ b/src/field_summary.cpp @@ -22,7 +22,7 @@ #include "field_summary.h" #include "timer.h" #include "ideal_gas.h" -#include "utils.hpp" + #include @@ -75,6 +75,9 @@ void field_summary(global_variables &globals, parallel_ ¶llel) { double ke = 0.0; double press = 0.0; + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; @@ -85,28 +88,50 @@ void field_summary(global_variables &globals, parallel_ ¶llel) { int xmin = t.info.t_xmin; field_type &field = t.field; - _Pragma("kernel1d") - for (int idx = (0); idx < ((ymax - ymin + 1) * (xmax - xmin + 1)); idx++) { + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *volume = field.volume.data; + double *density0 = field.density0.data; + double *energy0 = field.energy0.data; + double *pressure = field.pressure.data; + double *xvel0 = field.xvel0.data; + double 
*yvel0 = field.yvel0.data; + + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) \ + map(tofrom:vol) \ + map(tofrom:mass) \ + map(tofrom:ie) \ + map(tofrom:ke) \ + map(tofrom:press) \ + reduction(+:vol, mass, ie, ke, press) + for (int idx = 0; idx < ((ymax - ymin + 1) * (xmax - xmin + 1)); idx++) { const int j = xmin + 1 + idx % (xmax - xmin + 1); const int k = ymin + 1 + idx / (xmax - xmin + 1); double vsqrd = 0.0; for (int kv = k; kv <= k + 1; ++kv) { for (int jv = j; jv <= j + 1; ++jv) { - vsqrd += 0.25 * (field.xvel0(jv, kv) * field.xvel0(jv, kv) + field.yvel0(jv, kv) * field.yvel0(jv, kv)); + vsqrd += 0.25 * (xvel0[(jv) + (kv) * vels_wk_stride] * xvel0[(jv) + (kv) * vels_wk_stride] + yvel0[(jv) + (kv) * vels_wk_stride] * yvel0[(jv) + (kv) * vels_wk_stride]); } } - double cell_vol = field.volume(j, k); - double cell_mass = cell_vol * field.density0(j, k); + double cell_vol = volume[j + (k) * base_stride]; + double cell_mass = cell_vol * density0[j + (k) * base_stride]; vol += cell_vol; mass += cell_mass; - ie += cell_mass * field.energy0(j, k); + ie += cell_mass * energy0[j + (k) * base_stride]; ke += cell_mass * 0.5 * vsqrd; - press += cell_vol * field.pressure(j, k); + press += cell_vol * pressure[j + (k) * base_stride]; } } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + clover_sum(vol); clover_sum(mass); clover_sum(ie); diff --git a/src/finalise_field.cpp b/src/finalise_field.cpp new file mode 100644 index 0000000..3ce2150 --- /dev/null +++ b/src/finalise_field.cpp @@ -0,0 +1,110 @@ +/* + Crown Copyright 2012 AWE. + + This file is part of CloverLeaf. + + CloverLeaf is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your option) + any later version. 
+ + CloverLeaf is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + CloverLeaf. If not, see http://www.gnu.org/licenses/. + */ + + +// @brief Allocates the data for each mesh chunk +// @author Wayne Gaudin +// @details The data fields for the mesh chunk are allocated based on the mesh +// size. + + +#include "finalise_field.h" + + +// Allocate Kokkos Views for the data arrays +void finalise_field(global_variables &globals) { + + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { + + tile_type &t = globals.chunk.tiles[tile]; + field_type &field = t.field; + + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *soundspeed = field.soundspeed.data; + double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + double *mass_flux_x = field.mass_flux_x.data; + double *mass_flux_y = field.mass_flux_y.data; + double *work_array1 = field.work_array1.data; + double *work_array2 = field.work_array2.data; + double *work_array3 = field.work_array3.data; + double *work_array4 = field.work_array4.data; + double *work_array5 = field.work_array5.data; + double *work_array6 = field.work_array6.data; + double *work_array7 = field.work_array7.data; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + double *vertexx = field.vertexx.data; + double 
*vertexdx = field.vertexdx.data; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + + #pragma omp target exit data \ + map(release: density0[:0]) \ + map(release: density1[:0]) \ + map(release: energy0[:0]) \ + map(release: energy1[:0]) \ + map(release: pressure[:0]) \ + map(release: viscosity[:0]) \ + map(release: soundspeed[:0]) \ + map(release: yvel0[:0]) \ + map(release: yvel1[:0]) \ + map(release: xvel0[:0]) \ + map(release: xvel1[:0]) \ + map(release: vol_flux_x[:0]) \ + map(release: vol_flux_y[:0]) \ + map(release: mass_flux_x[:0]) \ + map(release: mass_flux_y[:0]) \ + map(release: work_array1[:0]) \ + map(release: work_array2[:0]) \ + map(release: work_array3[:0]) \ + map(release: work_array4[:0]) \ + map(release: work_array5[:0]) \ + map(release: work_array6[:0]) \ + map(release: work_array7[:0]) \ + map(release: cellx[:0]) \ + map(release: celldx[:0]) \ + map(release: celly[:0]) \ + map(release: celldy[:0]) \ + map(release: vertexx[:0]) \ + map(release: vertexdx[:0]) \ + map(release: vertexy[:0]) \ + map(release: vertexdy[:0]) \ + map(release: volume[:0]) \ + map(release: xarea[:0]) \ + map(release: yarea[:0]) + + } + +} + diff --git a/src/cxx14_compat.hpp b/src/finalise_field.h similarity index 58% rename from src/cxx14_compat.hpp rename to src/finalise_field.h index 7181e0f..c558a3e 100644 --- a/src/cxx14_compat.hpp +++ b/src/finalise_field.h @@ -3,14 +3,14 @@ This file is part of CloverLeaf. 
- CloverLeaf is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the - Free Software Foundation, either version 3 of the License, or (at your option) + CloverLeaf is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your option) any later version. - CloverLeaf is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + CloverLeaf is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with @@ -18,17 +18,12 @@ */ -#ifndef CXX14_COMPAT_HPP -#define CXX14_COMPAT_HPP +#ifndef FINALISE_FIELD_H +#define FINALISE_FIELD_H -#include +#include "definitions.h" +void finalise_field(global_variables &globals); -// taken from https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique -// one of the possible reference implementations -template -std::unique_ptr make_unique(Args &&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} +#endif -#endif //CXX14_COMPAT_HPP diff --git a/src/flux_calc.cpp b/src/flux_calc.cpp index 2095a10..694283c 100644 --- a/src/flux_calc.cpp +++ b/src/flux_calc.cpp @@ -21,33 +21,45 @@ #include "flux_calc.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran flux kernel. // @author Wayne Gaudin // @details The edge volume fluxes are calculated based on the velocity fields. 
void flux_calc_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, double dt, - clover::Buffer2D &xarea, - clover::Buffer2D &yarea, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, - clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y) { + field_type &field) { // DO k=y_min,y_max+1 // DO j=x_min,x_max+1 -// Note that the loops calculate one extra flux than required, but this + // Note that the loops calculate one extra flux than required, but this // allows loop fusion that improves performance - _Pragma("kernel2d") + + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + double *xvel1 = field.xvel1.data; + double *yvel1 = field.yvel1.data; + double *vol_flux_x = field.vol_flux_x.data; + double *vol_flux_y = field.vol_flux_y.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - vol_flux_x(i, j) = 0.25 * dt * xarea(i, j) * (xvel0(i, j) + xvel0(i + 0, j + 1) + xvel1(i, j) + xvel1(i + 0, j + 1)); - vol_flux_y(i, j) = 0.25 * dt * yarea(i, j) * (yvel0(i, j) + yvel0(i + 1, j + 0) + yvel1(i, j) + yvel1(i + 1, j + 0)); + vol_flux_x[i + j * flux_x_stride] = 0.25 * dt * xarea[i + j * flux_x_stride] * + (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride] + xvel1[i + j * vels_wk_stride] + + xvel1[(i + 0) + (j + 1) * vels_wk_stride]); + vol_flux_y[i + j * flux_y_stride] = 0.25 * dt * yarea[i + j * flux_y_stride] * + (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * vels_wk_stride] + yvel1[i + j * vels_wk_stride] + + yvel1[(i + 1) + (j + 0) * vels_wk_stride]); } } } 
@@ -61,25 +73,27 @@ void flux_calc(global_variables &globals) { if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; flux_calc_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, globals.dt, - t.field.xarea, - t.field.yarea, - t.field.xvel0, - t.field.yvel0, - t.field.xvel1, - t.field.yvel1, - t.field.vol_flux_x, - t.field.vol_flux_y); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.flux += timer() - kernel_time; } diff --git a/src/generate_chunk.cpp b/src/generate_chunk.cpp index db57d71..7db9169 100644 --- a/src/generate_chunk.cpp +++ b/src/generate_chunk.cpp @@ -27,36 +27,35 @@ #include #include "generate_chunk.h" -#include "utils.hpp" void generate_chunk(const int tile, global_variables &globals) { // Need to copy the host array of state input data into a device array - Buffer1D state_density(globals.config.number_of_states); - Buffer1D state_energy(globals.config.number_of_states); - Buffer1D state_xvel(globals.config.number_of_states); - Buffer1D state_yvel(globals.config.number_of_states); - Buffer1D state_xmin(globals.config.number_of_states); - Buffer1D state_xmax(globals.config.number_of_states); - Buffer1D state_ymin(globals.config.number_of_states); - Buffer1D state_ymax(globals.config.number_of_states); - Buffer1D state_radius(globals.config.number_of_states); - Buffer1D state_geometry(globals.config.number_of_states); + clover::Buffer1D state_density_buffer(globals.config.number_of_states); + clover::Buffer1D state_energy_buffer(globals.config.number_of_states); + clover::Buffer1D state_xvel_buffer(globals.config.number_of_states); + clover::Buffer1D state_yvel_buffer(globals.config.number_of_states); + clover::Buffer1D state_xmin_buffer(globals.config.number_of_states); + clover::Buffer1D 
state_xmax_buffer(globals.config.number_of_states); + clover::Buffer1D state_ymin_buffer(globals.config.number_of_states); + clover::Buffer1D state_ymax_buffer(globals.config.number_of_states); + clover::Buffer1D state_radius_buffer(globals.config.number_of_states); + clover::Buffer1D state_geometry_buffer(globals.config.number_of_states); // Copy the data to the new views for (int state = 0; state < globals.config.number_of_states; ++state) { - state_density[state] = globals.config.states[state].density; - state_energy[state] = globals.config.states[state].energy; - state_xvel[state] = globals.config.states[state].xvel; - state_yvel[state] = globals.config.states[state].yvel; - state_xmin[state] = globals.config.states[state].xmin; - state_xmax[state] = globals.config.states[state].xmax; - state_ymin[state] = globals.config.states[state].ymin; - state_ymax[state] = globals.config.states[state].ymax; - state_radius[state] = globals.config.states[state].radius; - state_geometry[state] = globals.config.states[state].geometry; + state_density_buffer[state] = globals.config.states[state].density; + state_energy_buffer[state] = globals.config.states[state].energy; + state_xvel_buffer[state] = globals.config.states[state].xvel; + state_yvel_buffer[state] = globals.config.states[state].yvel; + state_xmin_buffer[state] = globals.config.states[state].xmin; + state_xmax_buffer[state] = globals.config.states[state].xmax; + state_ymin_buffer[state] = globals.config.states[state].ymin; + state_ymax_buffer[state] = globals.config.states[state].ymax; + state_radius_buffer[state] = globals.config.states[state].radius; + state_geometry_buffer[state] = globals.config.states[state].geometry; } // Kokkos::deep_copy (TO, FROM) @@ -76,58 +75,99 @@ void generate_chunk(const int tile, global_variables &globals) { field_type &field = globals.chunk.tiles[tile].field; + const double state_energy_0 = state_energy_buffer[0]; + const double state_density_0 = state_density_buffer[0]; + const 
double state_xvel_0 = state_xvel_buffer[0]; + const double state_yvel_0 = state_yvel_buffer[0]; + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + // State 1 is always the background state - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { - field.energy0(i, j) = state_energy[0]; - field.density0(i, j) = state_density[0]; - field.xvel0(i, j) = state_xvel[0]; - field.yvel0(i, j) = state_yvel[0]; + double *energy0 = field.energy0.data; + double *density0 = field.density0.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(globals.use_target) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { + energy0[i + j * base_stride] = state_energy_0; + density0[i + j * base_stride] = state_density_0; + xvel0[i + j * vels_wk_stride] = state_xvel_0; + yvel0[i + j * vels_wk_stride] = state_yvel_0; } } for (int state = 1; state < globals.config.number_of_states; ++state) { - _Pragma("kernel2d") - for (int j = (0); j < (yrange); j++) { - for (int i = (0); i < (xrange); i++) { + + double *cellx = field.cellx.data; + double *celly = field.celly.data; + + double *vertexx = field.vertexx.data; + double *vertexy = field.vertexy.data; + + const double *state_density = state_density_buffer.data; + const double *state_energy = state_energy_buffer.data; + const double *state_xvel = state_xvel_buffer.data; + const double *state_yvel = state_yvel_buffer.data; + const double *state_xmin = state_xmin_buffer.data; + const double *state_xmax = state_xmax_buffer.data; + const double *state_ymin = state_ymin_buffer.data; + const double *state_ymax = state_ymax_buffer.data; + const double *state_radius = state_radius_buffer.data; + const int *state_geometry = state_geometry_buffer.data; + + #pragma omp target teams distribute parallel for simd collapse(2) 
clover_use_target(globals.use_target) \ + map(to : state_density[:state_density_buffer.N()]) \ + map(to : state_energy[:state_energy_buffer.N()]) \ + map(to : state_xvel[:state_xvel_buffer.N()]) \ + map(to : state_yvel[:state_yvel_buffer.N()]) \ + map(to : state_xmin[:state_xmin_buffer.N()]) \ + map(to : state_xmax[:state_xmax_buffer.N()]) \ + map(to : state_ymin[:state_ymin_buffer.N()]) \ + map(to : state_ymax[:state_ymax_buffer.N()]) \ + map(to : state_radius[:state_radius_buffer.N()]) \ + map(to : state_geometry[:state_geometry_buffer.N()]) + for (int j = 0; j < (yrange); j++) { + for (int i = 0; i < (xrange); i++) { double x_cent = state_xmin[state]; double y_cent = state_ymin[state]; if (state_geometry[state] == g_rect) { - if (field.vertexx[i + 1] >= state_xmin[state] && field.vertexx[i] < state_xmax[state]) { - if (field.vertexy[j + 1] >= state_ymin[state] && field.vertexy[j] < state_ymax[state]) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + if (vertexx[i + 1] >= state_xmin[state] && vertexx[i] < state_xmax[state]) { + if (vertexy[j + 1] >= state_ymin[state] && vertexy[j] < state_ymax[state]) { + energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } } } else if (state_geometry[state] == g_circ) { - double radius = std::sqrt((field.cellx[i] - x_cent) * - (field.cellx[i] - x_cent) + (field.celly[j] - y_cent) * (field.celly[j] - y_cent)); + double radius = sqrt((cellx[i] - x_cent) * + (cellx[i] - x_cent) + (celly[j] - y_cent) * (celly[j] - y_cent)); if (radius <= state_radius[state]) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + 
energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } } else if (state_geometry[state] == g_point) { - if (field.vertexx[i] == x_cent && field.vertexy[j] == y_cent) { - field.energy0(i, j) = state_energy[state]; - field.density0(i, j) = state_density[state]; + if (vertexx[i] == x_cent && vertexy[j] == y_cent) { + energy0[i + j * base_stride] = state_energy[state]; + density0[i + j * base_stride] = state_density[state]; for (int kt = j; kt <= j + 1; ++kt) { for (int jt = i; jt <= i + 1; ++jt) { - field.xvel0(jt, kt) = state_xvel[state]; - field.yvel0(jt, kt) = state_yvel[state]; + xvel0[jt + kt * vels_wk_stride] = state_xvel[state]; + yvel0[jt + kt * vels_wk_stride] = state_yvel[state]; } } } diff --git a/src/hydro.cpp b/src/hydro.cpp index 5d53597..060c108 100644 --- a/src/hydro.cpp +++ b/src/hydro.cpp @@ -18,18 +18,18 @@ */ +#include "accelerate.h" #include "hydro.h" #include "timer.h" #include "field_summary.h" #include "visit.h" #include "timestep.h" #include "PdV.h" -#include "accelerate.h" #include "flux_calc.h" #include "advection.h" #include "reset_field.h" +#include "finalise_field.h" -#include extern std::ostream g_out; @@ -220,6 +220,7 @@ void hydro(global_variables &globals, parallel_ ¶llel) { } } + //clover_finalize(); Skipped as just closes the file and calls MPI_Finalize (which is done back in main). 
break; @@ -229,16 +230,16 @@ void hydro(global_variables &globals, parallel_ ¶llel) { if (parallel.boss) { wall_clock = timer() - timerstart; double step_clock = timer() - step_time; - g_out << "Wall clock " << wall_clock << std::endl; - std::cout << "Wall clock " << wall_clock << std::endl; + g_out << "Wall clock " << wall_clock << "\n"; + std::cout << "Wall clock " << wall_clock << "\n"; double cells = globals.config.grid.x_cells * globals.config.grid.y_cells; double rstep = globals.step; double grind_time = wall_clock / (rstep * cells); double step_grind = step_clock / cells; - std::cout << "Average time per cell " << grind_time << std::endl; - g_out << "Average time per cell " << grind_time << std::endl; - std::cout << "Step time per cell " << step_grind << std::endl; - g_out << "Step time per cell " << step_grind << std::endl; + std::cout << "Average time per cell " << grind_time << "\n"; + g_out << "Average time per cell " << grind_time << "\n"; + std::cout << "Step time per cell " << step_grind << "\n"; + g_out << "Step time per cell " << step_grind << "\n"; } } diff --git a/src/ideal_gas.cpp b/src/ideal_gas.cpp index 2b55231..42a389e 100644 --- a/src/ideal_gas.cpp +++ b/src/ideal_gas.cpp @@ -21,8 +21,8 @@ #include #include "ideal_gas.h" -#include "utils.hpp" +#include "comms.h" #define IDX(buffer, x, y) buffer[idx[(x)]][idx[(y)]] @@ -35,11 +35,11 @@ int N = 0; // @details Calculates the pressure and sound speed for the mesh chunk using // the ideal gas equation of state, with a fixed gamma of 1.4. 
void ideal_gas_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density, - clover::Buffer2D &energy, - clover::Buffer2D &pressure, - clover::Buffer2D &soundspeed) { + field_type &field, + clover::Buffer2D &density_buffer, + clover::Buffer2D &energy_buffer) { //std::cout <<" ideal_gas(" << x_min+1 << ","<< y_min+1<< ","<< x_max+2<< ","<< y_max +2 << ")" << std::endl; // DO k=y_min,y_max @@ -47,15 +47,22 @@ void ideal_gas_kernel( // Kokkos::MDRangePolicy > policy({x_min + 1, y_min + 1}, {x_max + 2, y_max + 2}); - _Pragma("kernel2d") + const int base_stride = field.base_stride; + + double *density = density_buffer.data; + double *energy = energy_buffer.data; + double *pressure = field.pressure.data; + double *soundspeed = field.soundspeed.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double v = 1.0 / density(i, j); - pressure(i, j) = (1.4 - 1.0) * density(i, j) * energy(i, j); - double pressurebyenergy = (1.4 - 1.0) * density(i, j); - double pressurebyvolume = -density(i, j) * pressure(i, j); - double sound_speed_squared = v * v * (pressure(i, j) * pressurebyenergy - pressurebyvolume); - soundspeed(i, j) = std::sqrt(sound_speed_squared); + double v = 1.0 / density[i + j * base_stride]; + pressure[i + j * base_stride] = (1.4 - 1.0) * density[i + j * base_stride] * energy[i + j * base_stride]; + double pressurebyenergy = (1.4 - 1.0) * density[i + j * base_stride]; + double pressurebyvolume = -density[i + j * base_stride] * pressure[i + j * base_stride]; + double sound_speed_squared = v * v * (pressure[i + j * base_stride] * pressurebyenergy - pressurebyvolume); + soundspeed[i + j * base_stride] = std::sqrt(sound_speed_squared); } }; @@ -71,30 +78,37 @@ void ideal_gas(global_variables &globals, const int tile, bool predict) { tile_type &t = globals.chunk.tiles[tile]; + 
#if SYNC_BUFFERS + globals.hostToDevice(); + #endif if (!predict) { ideal_gas_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, + t.field, t.field.density0, - t.field.energy0, - t.field.pressure, - t.field.soundspeed + t.field.energy0 ); } else { ideal_gas_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, + t.field, t.field.density1, - t.field.energy1, - t.field.pressure, - t.field.soundspeed + t.field.energy1 ); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/initialise.cpp b/src/initialise.cpp index 51a7349..bf3c3cb 100644 --- a/src/initialise.cpp +++ b/src/initialise.cpp @@ -41,7 +41,7 @@ std::ofstream of; struct RunConfig { std::string file; - size_t deviceIdx; + int deviceIdx; }; @@ -66,7 +66,9 @@ void printHelp(const std::string &name) { RunConfig parseArgs(const size_t num_devices, const std::vector &args) { - const auto readParam = [&args](size_t current, const std::string &emptyMessage, auto map) { + const auto readParam = [&args](size_t current, + const std::string &emptyMessage, + const std::function &map) { if (current + 1 < args.size()) { return map(args[current + 1]); } else { @@ -84,21 +86,22 @@ RunConfig parseArgs(const size_t num_devices, printHelp(args[0]); std::exit(EXIT_SUCCESS); } else if (arg == "--list") { + std::cout << "OMP devices:" << std::endl; printSimple(num_devices); std::exit(EXIT_SUCCESS); } else if (arg == "--no-target") { config.deviceIdx = -1; } else if (arg == "--device") { - readParam(i, "--device specified but no index was given", [&](const auto ¶m) { + readParam(i, "--device specified but no index was given", [&](const std::string ¶m) { auto selected = std::stoul(param); - if (selected < 0 || selected >= num_devices) { + if (selected >= num_devices) { std::cerr << "bad device index `" << param << "`" << std::endl; std::exit(EXIT_FAILURE); } config.deviceIdx = selected; }); } else if (arg == "--file") { - readParam(i, 
"--file specified but no file was given", [&config](const auto ¶m) { + readParam(i, "--file specified but no file was given", [&config](const std::string ¶m) { config.file = param; }); } @@ -106,8 +109,7 @@ RunConfig parseArgs(const size_t num_devices, return config; } -std::unique_ptr -initialise(parallel_ ¶llel, const std::vector &args) { +global_variables initialise(parallel_ ¶llel, const std::vector &args) { global_config config; @@ -133,23 +135,43 @@ initialise(parallel_ ¶llel, const std::vector &args) { clover_barrier(); -// -// int x = 1; -// #pragma omp target map(tofrom: x) -// x = x + 1; auto num_devices = omp_get_num_devices(); - if (num_devices == 0) { - std::cout << "No OMP target devices available" << std::endl; - } else { - std::cout << "Detected OMP devices:" << std::endl; - printSimple(num_devices); + if (parallel.boss) { + + if (num_devices == 0) { + std::cout << "No OMP target devices available" << std::endl; + } else { + std::cout << "Detected OMP devices:" << std::endl; + printSimple(num_devices); + } + std::cout << "\n" << std::endl; } auto runConfig = parseArgs(num_devices, args); auto file = runConfig.file; auto selectedDevice = runConfig.deviceIdx; - std::cout << "Using OMP device: " << selectedDevice << std::endl; + auto useTarget = selectedDevice != -1; + + if (parallel.boss) { + (!useTarget ? 
+ std::cout << "Using OMP device: (host fallback))" : + std::cout << "Using OMP device: #" << selectedDevice) << std::endl; + } + + if (!useTarget) { + std::cout << "Using OMP device: (host fallback))" << std::endl; + + #ifndef OMP_ALLOW_HOST + std::cerr << "Error: host fallback mode selected but OMP_ALLOW_HOST not enabled at compile time" << std::endl; + std::exit(EXIT_FAILURE); + #endif + + + } else { + omp_set_default_device(selectedDevice); + } + std::ifstream g_in; if (parallel.boss) { @@ -207,9 +229,9 @@ initialise(parallel_ ¶llel, const std::vector &args) { config.number_of_chunks = parallel.max_task; - auto globals = start(parallel, config, selectedDevice); + auto globals = start(parallel, config, selectedDevice, useTarget); - clover_barrier(*globals); + clover_barrier(globals); if (parallel.boss) { g_out << "Starting the calculation" << std::endl; diff --git a/src/initialise.h b/src/initialise.h index 2acc50a..4bdc26d 100644 --- a/src/initialise.h +++ b/src/initialise.h @@ -24,7 +24,7 @@ #include "comms.h" #include "definitions.h" -std::unique_ptr initialise(parallel_ ¶llel, const std::vector &args); +global_variables initialise(parallel_ ¶llel, const std::vector &args); #endif diff --git a/src/initialise_chunk.cpp b/src/initialise_chunk.cpp index ed5c694..c7b93fe 100644 --- a/src/initialise_chunk.cpp +++ b/src/initialise_chunk.cpp @@ -27,7 +27,7 @@ #include "initialise_chunk.h" -#include "utils.hpp" + void initialise_chunk(const int tile, global_variables &globals) { @@ -56,43 +56,60 @@ void initialise_chunk(const int tile, global_variables &globals) { field_type &field = globals.chunk.tiles[tile].field; - _Pragma("kernel1d") - for (int j = (0); j < (xrange); j++) { - field.vertexx[j] = xmin + dx * (j - 1 - x_min); - field.vertexdx[j] = dx; + double *vertexx = field.vertexx.data; + double *vertexdx = field.vertexdx.data; + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int j = 0; j < (xrange); j++) { + 
vertexx[j] = xmin + dx * (j - 1 - x_min); + vertexdx[j] = dx; } - _Pragma("kernel1d") - for (int k = (0); k < (yrange); k++) { - field.vertexy[k] = ymin + dy * (k - 1 - y_min); - field.vertexdy[k] = dy; + double *vertexy = field.vertexy.data; + double *vertexdy = field.vertexdy.data; + + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int k = 0; k < (yrange); k++) { + vertexy[k] = ymin + dy * (k - 1 - y_min); + vertexdy[k] = dy; } const int xrange1 = (x_max + 2) - (x_min - 2) + 1; const int yrange1 = (y_max + 2) - (y_min - 2) + 1; - _Pragma("kernel1d") - for (int j = (0); j < (xrange1); j++) { - field.cellx[j] = 0.5 * (field.vertexx[j] + field.vertexx[j + 1]); - field.celldx[j] = dx; + double *cellx = field.cellx.data; + double *celldx = field.celldx.data; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int j = 0; j < (xrange1); j++) { + cellx[j] = 0.5 * (vertexx[j] + vertexx[j + 1]); + celldx[j] = dx; } - _Pragma("kernel1d") - for (int k = (0); k < (yrange1); k++) { - field.celly[k] = 0.5 * (field.vertexy[k] + field.vertexy[k + 1]); - field.celldy[k] = dy; + double *celly = field.celly.data; + double *celldy = field.celldy.data; + #pragma omp target teams distribute parallel for simd clover_use_target(globals.use_target) + for (int k = 0; k < (yrange1); k++) { + celly[k] = 0.5 * (vertexy[k] + vertexy[k + 1]); + celldy[k] = dy; } + const int base_stride = field.base_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + + double *volume = field.volume.data; + double *xarea = field.xarea.data; + double *yarea = field.yarea.data; - _Pragma("kernel2d") - for (int j = (0); j < (yrange1); j++) { - for (int i = (0); i < (xrange1); i++) { - field.volume(i, j) = dx * dy; - field.xarea(i, j) = field.celldy[j]; - field.yarea(i, j) = field.celldx[i]; + #pragma omp target teams distribute parallel for simd collapse(2) 
clover_use_target(globals.use_target) + for (int j = 0; j < (yrange1); j++) { + for (int i = 0; i < (xrange1); i++) { + volume[i + j * base_stride] = dx * dy; + xarea[i + j * flux_x_stride] = celldy[j]; + yarea[i + j * flux_y_stride] = celldx[i]; } } diff --git a/src/pack_kernel.cpp b/src/pack_kernel.cpp index c55f1d7..3e70ec5 100644 --- a/src/pack_kernel.cpp +++ b/src/pack_kernel.cpp @@ -24,11 +24,11 @@ #include "pack_kernel.h" -#include "utils.hpp" -void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &left_snd, + +void clover_pack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &left_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -54,14 +54,17 @@ void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth + // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + double *left_snd = left_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - left_snd[index] = field(x_min + x_inc - 1 + j, k); + left_snd[index] = field[(x_min + x_inc - 1 + j) + (k) * field_sizex]; } } @@ -69,9 +72,9 @@ void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, } -void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &left_rcv, +void clover_unpack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &left_rcv_buffer, int cell_data, int 
vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -93,16 +96,19 @@ void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth + // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *left_rcv = left_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - field(x_min - j, k) = left_rcv[index]; + field[(x_min - j) + (k) * field_sizex] = left_rcv[index]; } } @@ -110,9 +116,9 @@ void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, } -void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &right_snd, +void clover_pack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &right_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -134,12 +140,15 @@ void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + // DO k=y_min-depth,y_max+y_inc+depth + double *right_snd = right_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - right_snd[index] = field(x_min + 1 + j, k); + right_snd[index] = field[(x_min + 1 + j) + (k) * field_sizex]; 
} } @@ -147,9 +156,9 @@ void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, } -void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &right_rcv, +void clover_unpack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &right_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -175,21 +184,24 @@ void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, y_inc = 1; } - // DO k=y_min-depth,y_max+y_inc+depth - _Pragma("kernel1d") + // DO k=y_min-depth,y_max+y_inc+depth + double *right_rcv = right_rcv_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + y_inc + depth + 2); k++) { for (int j = 0; j < depth; ++j) { int index = buffer_offset + j + (k + depth - 1) * depth; - right_rcv[index] = field(x_max + x_inc + j, k); + right_rcv[index] = field[(x_max + x_inc + j) + (k) * field_sizex]; } } } -void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &top_snd, +void clover_pack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &top_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -214,17 +226,20 @@ void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *top_snd = top_snd_buffer.data; + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel 
for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - top_snd[index] = field(j, y_max + 1 - k); + top_snd[index] = field[j + (y_max + 1 - k) * field_sizex]; } } } -void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &top_rcv, +void clover_unpack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &top_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -254,18 +269,21 @@ void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *top_rcv = top_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - field(j, y_max + y_inc + k) = top_rcv[index]; + field[j + (y_max + y_inc + k) * field_sizex] = top_rcv[index]; } } } -void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &bottom_snd, +void clover_pack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &bottom_snd_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -294,17 +312,20 @@ void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *bottom_snd = bottom_snd_buffer.data; + double *field = field_buffer.data; 
+ const int field_sizex = field_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - bottom_snd[index] = field(j, y_min + y_inc - 1 + k); + bottom_snd[index] = field[j + (y_min + y_inc - 1 + k) * field_sizex]; } } } -void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &field, - clover::Buffer1D &bottom_rcv, +void clover_unpack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, + clover::Buffer2D &field_buffer, + clover::Buffer1D &bottom_rcv_buffer, int cell_data, int vertex_data, int x_face_data, int y_face_data, int depth, int field_type, int buffer_offset) { @@ -329,10 +350,13 @@ void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, for (int k = 0; k < depth; ++k) { // DO j=x_min-depth,x_max+x_inc+depth - _Pragma("kernel1d") + double *field = field_buffer.data; + const int field_sizex = field_buffer.nX(); + double *bottom_rcv = bottom_rcv_buffer.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + x_inc + depth + 2); j++) { int index = buffer_offset + k + (j + depth - 1) * depth; - field(j, y_min - k) = bottom_rcv[index]; + field[j + (y_min - k) * field_sizex] = bottom_rcv[index]; } } } diff --git a/src/pack_kernel.h b/src/pack_kernel.h index dfa6c38..32f27dc 100644 --- a/src/pack_kernel.h +++ b/src/pack_kernel.h @@ -22,44 +22,44 @@ #define PACK_KERNEL_H #include "definitions.h" -#include "utils.hpp" -void clover_pack_message_left(int x_min, int x_max, int y_min, int y_max, + +void clover_pack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &left_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, 
int buffer_offset); -void clover_unpack_message_left(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_left(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &left_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_right(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &right_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_unpack_message_right(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_right(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &right_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_top(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &top_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_unpack_message_top(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_top(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &top_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, int field_type, int buffer_offset); -void clover_pack_message_bottom(int x_min, int x_max, int y_min, int y_max, +void clover_pack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &bottom_snd, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, 
int field_type, int buffer_offset); -void clover_unpack_message_bottom(int x_min, int x_max, int y_min, int y_max, +void clover_unpack_message_bottom(bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &field, clover::Buffer1D &bottom_rcv, int cell_data, int vertex_data, int x_face_fata, int y_face_data, int depth, diff --git a/src/reset_field.cpp b/src/reset_field.cpp index daa4c9d..62982d4 100644 --- a/src/reset_field.cpp +++ b/src/reset_field.cpp @@ -20,32 +20,32 @@ #include "reset_field.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran reset field kernel. // @author Wayne Gaudin // @details Copies all of the final end of step filed data to the begining of // step data, ready for the next timestep. void reset_field_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1, - clover::Buffer2D &xvel0, - clover::Buffer2D &xvel1, - clover::Buffer2D &yvel0, - clover::Buffer2D &yvel1) { + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + const int base_stride = field.base_stride; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - density0(i, j) = density1(i, j); - energy0(i, j) = energy1(i, j); + density0[i + j * base_stride] = density1[i + j * base_stride]; + energy0[i + j * base_stride] = energy1[i + j * base_stride]; } } @@ -54,11 +54,17 @@ void reset_field_kernel( // DO k=y_min,y_max+1 // DO j=x_min,x_max+1 - _Pragma("kernel2d") + const int vels_wk_stride = field.vels_wk_stride; + double *xvel0 = field.xvel0.data; + double *xvel1 = field.xvel1.data; + 
double *yvel0 = field.yvel0.data; + double *yvel1 = field.yvel1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) { - xvel0(i, j) = xvel1(i, j); - yvel0(i, j) = yvel1(i, j); + xvel0[i + j * vels_wk_stride] = xvel1[i + j * vels_wk_stride]; + yvel0[i + j * vels_wk_stride] = yvel1[i + j * vels_wk_stride]; } } @@ -73,26 +79,26 @@ void reset_field(global_variables &globals) { double kernel_time = 0; if (globals.profiler_on) kernel_time = timer(); + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; reset_field_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1, - t.field.xvel0, - t.field.xvel1, - t.field.yvel0, - t.field.yvel1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + if (globals.profiler_on) globals.profiler.reset += timer() - kernel_time; } diff --git a/src/revert.cpp b/src/revert.cpp index 681d0ad..036925a 100644 --- a/src/revert.cpp +++ b/src/revert.cpp @@ -19,7 +19,7 @@ #include "revert.h" -#include "utils.hpp" + // @brief Fortran revert kernel. // @author Wayne Gaudin @@ -27,19 +27,24 @@ // it to the start of step data, ready for the corrector. // Note that this does not seem necessary in this proxy-app but should be // left in to remain relevant to the full method. 
-void revert_kernel(int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, - clover::Buffer2D &density1, - clover::Buffer2D &energy0, - clover::Buffer2D &energy1) { +void revert_kernel( + bool use_target, + int x_min, int x_max, int y_min, int y_max, + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + const int base_stride = field.base_stride; + double *density0 = field.density0.data; + double *density1 = field.density1.data; + double *energy0 = field.energy0.data; + double *energy1 = field.energy1.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - density1(i, j) = density0(i, j); - energy1(i, j) = energy0(i, j); + density1[i + j * base_stride] = density0[i + j * base_stride]; + energy1[i + j * base_stride] = energy0[i + j * base_stride]; } } @@ -51,19 +56,25 @@ void revert_kernel(int x_min, int x_max, int y_min, int y_max, // @details Invokes the user specified revert kernel. 
void revert(global_variables &globals) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; revert_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - t.field.density0, - t.field.density1, - t.field.energy0, - t.field.energy1); + t.field); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + + } diff --git a/src/start.cpp b/src/start.cpp index 0cfff35..ad647be 100644 --- a/src/start.cpp +++ b/src/start.cpp @@ -34,16 +34,17 @@ #include "field_summary.h" #include "update_halo.h" #include "visit.h" -#include "cxx14_compat.hpp" +#include "flux_calc.h" #include #include extern std::ostream g_out; -std::unique_ptr start(parallel_ ¶llel, - const global_config &config, - size_t omp_device) { +global_variables start(parallel_ ¶llel, + const global_config &config, + size_t omp_device, + bool use_target) { if (parallel.boss) { g_out << "Setting up initial geometry" << std::endl @@ -67,6 +68,7 @@ std::unique_ptr start(parallel_ ¶llel, global_variables globals(config, omp_device, + use_target, chunk_type( chunkNeighbours, parallel.task, 1, 1, x_cells, y_cells, @@ -80,8 +82,15 @@ std::unique_ptr start(parallel_ ¶llel, auto infos = clover_tile_decompose(globals, x_cells, y_cells); - std::transform(infos.begin(), infos.end(), std::back_inserter(globals.chunk.tiles), - [](const tile_info &ti) { return tile_type(ti); }); + for (auto &ti : infos) { + globals.chunk.tiles.emplace_back(ti); + } + +// std::transform(infos.begin(), infos.end(), std::back_inserter(globals.chunk.tiles), +// [](const tile_info &ti) { return tile_type(ti); }); + + +// #pragma omp target enter data map(alloc: globals.chunk.tiles[0:N]) @@ -99,7 +108,10 @@ std::unique_ptr start(parallel_ ¶llel, for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { initialise_chunk(tile, globals); + if (DEBUG) std::cout << "Field initialised2" << std::endl; + 
generate_chunk(tile, globals); + if (DEBUG) std::cout << "Field initialised3" << std::endl; } @@ -111,9 +123,17 @@ std::unique_ptr start(parallel_ ¶llel, bool profiler_off = globals.profiler_on; globals.profiler_on = false; + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + for (int tile = 0; tile < config.tiles_per_chunk; ++tile) { ideal_gas(globals, tile, false); } + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif if (DEBUG) globals.dump("dump_0_after_ideal_gas.txt"); // Prime all halo data for the first step @@ -132,7 +152,11 @@ std::unique_ptr start(parallel_ ¶llel, fields[field_xvel1] = 1; fields[field_yvel1] = 1; + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif update_halo(globals, fields, 2); + if (DEBUG)globals.dump("dump_0_after_update_halo.txt"); @@ -149,6 +173,6 @@ std::unique_ptr start(parallel_ ¶llel, globals.profiler_on = profiler_off; - return make_unique(globals); + return globals; } diff --git a/src/start.h b/src/start.h index 146ffd4..b8ad4e3 100644 --- a/src/start.h +++ b/src/start.h @@ -25,9 +25,10 @@ #include "comms.h" #include "definitions.h" -std::unique_ptr start(parallel_ ¶llel, - const global_config &config, - size_t omp_device); +global_variables start(parallel_ ¶llel, + const global_config &config, + size_t omp_device, + bool use_target); #endif diff --git a/src/timestep.cpp b/src/timestep.cpp index 8efcbfa..10ee443 100644 --- a/src/timestep.cpp +++ b/src/timestep.cpp @@ -97,10 +97,10 @@ void timestep(global_variables &globals, parallel_ ¶llel) { if (parallel.boss) { g_out << " Step " << globals.step << " time " << globals.time << " control " << dt_control << " timestep " << globals.dt << " " << globals.jdt << "," << globals.kdt << " x " - << x_pos << " y " << y_pos << std::endl; + << x_pos << " y " << y_pos << "\n"; std::cout << " Step " << globals.step << " time " << globals.time << " control " << dt_control << " timestep " << globals.dt << " " << globals.jdt << "," - << globals.kdt << " x " << x_pos << " y " << y_pos 
<< std::endl; + << globals.kdt << " x " << x_pos << " y " << y_pos << "\n"; } if (small == 1) { diff --git a/src/update_halo.cpp b/src/update_halo.cpp index 478366d..1f84522 100644 --- a/src/update_halo.cpp +++ b/src/update_halo.cpp @@ -23,7 +23,7 @@ #include "update_halo.h" #include "update_tile_halo.h" #include "timer.h" -#include "utils.hpp" + // @brief Fortran kernel to update the external halo cells in a chunk. @@ -33,6 +33,7 @@ // of data governs how this is carried out. External boundaries are always // reflective. void update_halo_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, const std::array &chunk_neighbours, const std::array &tile_neighbours, @@ -41,6 +42,11 @@ void update_halo_kernel( int depth) { + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + const int flux_x_stride = field.flux_x_stride; + const int flux_y_stride = field.flux_y_stride; + // Update values in external halo cells based on depth and fields requested // Even though half of these loops look the wrong way around, it should be noted // that depth is either 1 or 2 so that it is more efficient to always thread @@ -51,10 +57,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density0(j, 1 - k) = field.density0(j, 2 + k); + density0[j + (1 - k) * base_stride] = density0[j + (2 + k) * base_stride]; } } @@ -64,10 +71,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density0(j, y_max + 2 + k) 
= field.density0(j, y_max + 1 - k); + density0[j + (y_max + 2 + k) * base_stride] = density0[j + (y_max + 1 - k) * base_stride]; } } @@ -77,10 +85,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density0(1 - j, k) = field.density0(2 + j, k); + density0[(1 - j) + (k) * base_stride] = density0[(2 + j) + (k) * base_stride]; } } @@ -90,10 +99,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = field.density0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density0(x_max + 2 + j, k) = field.density0(x_max + 1 - j, k); + density0[(x_max + 2 + j) + (k) * base_stride] = density0[(x_max + 1 - j) + (k) * base_stride]; } } @@ -107,10 +117,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density1(j, 1 - k) = field.density1(j, 2 + k); + density1[j + (1 - k) * base_stride] = density1[j + (2 + k) * base_stride]; } } @@ -120,10 +131,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.density1(j, y_max + 2 + k) = field.density1(j, y_max + 1 - k); + density1[j + (y_max + 2 + k) * 
base_stride] = density1[j + (y_max + 1 - k) * base_stride]; } } @@ -133,10 +145,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density1(1 - j, k) = field.density1(2 + j, k); + density1[(1 - j) + (k) * base_stride] = density1[(2 + j) + (k) * base_stride]; } } @@ -146,10 +159,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = field.density1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.density1(x_max + 2 + j, k) = field.density1(x_max + 1 - j, k); + density1[(x_max + 2 + j) + (k) * base_stride] = density1[(x_max + 1 - j) + (k) * base_stride]; } } @@ -161,10 +175,11 @@ void update_halo_kernel( (tile_neighbours[tile_bottom] == external_tile)) { // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy0(j, 1 - k) = field.energy0(j, 2 + k); + energy0[j + (1 - k) * base_stride] = energy0[j + (2 + k) * base_stride]; } } @@ -173,10 +188,11 @@ void update_halo_kernel( (tile_neighbours[tile_top] == external_tile)) { // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy0(j, y_max + 2 + k) = field.energy0(j, y_max + 1 - k); + energy0[j 
+ (y_max + 2 + k) * base_stride] = energy0[j + (y_max + 1 - k) * base_stride]; } } @@ -185,10 +201,11 @@ void update_halo_kernel( (tile_neighbours[tile_left] == external_tile)) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy0(1 - j, k) = field.energy0(2 + j, k); + energy0[(1 - j) + (k) * base_stride] = energy0[(2 + j) + (k) * base_stride]; } } @@ -197,10 +214,11 @@ void update_halo_kernel( (tile_neighbours[tile_right] == external_tile)) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = field.energy0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy0(x_max + 2 + j, k) = field.energy0(x_max + 1 - j, k); + energy0[(x_max + 2 + j) + (k) * base_stride] = energy0[(x_max + 1 - j) + (k) * base_stride]; } } @@ -214,10 +232,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy1(j, 1 - k) = field.energy1(j, 2 + k); + energy1[j + (1 - k) * base_stride] = energy1[j + (2 + k) * base_stride]; } } @@ -227,10 +246,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.energy1(j, y_max + 2 + k) = field.energy1(j, y_max + 1 - k); + 
energy1[j + (y_max + 2 + k) * base_stride] = energy1[j + (y_max + 1 - k) * base_stride]; } } @@ -240,10 +260,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy1(1 - j, k) = field.energy1(2 + j, k); + energy1[(1 - j) + (k) * base_stride] = energy1[(2 + j) + (k) * base_stride]; } } @@ -253,10 +274,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = field.energy1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.energy1(x_max + 2 + j, k) = field.energy1(x_max + 1 - j, k); + energy1[(x_max + 2 + j) + (k) * base_stride] = energy1[(x_max + 1 - j) + (k) * base_stride]; } } @@ -269,10 +291,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.pressure(j, 1 - k) = field.pressure(j, 2 + k); + pressure[j + (1 - k) * base_stride] = pressure[j + (2 + k) * base_stride]; } } @@ -282,10 +305,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.pressure(j, y_max + 2 + k) = field.pressure(j, y_max + 1 - k); + pressure[j + (y_max + 2 + k) * base_stride] = pressure[j + (y_max + 1 - k) * 
base_stride]; } } @@ -295,10 +319,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.pressure(1 - j, k) = field.pressure(2 + j, k); + pressure[(1 - j) + (k) * base_stride] = pressure[(2 + j) + (k) * base_stride]; } } @@ -308,10 +333,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = field.pressure.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.pressure(x_max + 2 + j, k) = field.pressure(x_max + 1 - j, k); + pressure[(x_max + 2 + j) + (k) * base_stride] = pressure[(x_max + 1 - j) + (k) * base_stride]; } } @@ -324,10 +350,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.viscosity(j, 1 - k) = field.viscosity(j, 2 + k); + viscosity[j + (1 - k) * base_stride] = viscosity[j + (2 + k) * base_stride]; } } @@ -337,10 +364,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.viscosity(j, y_max + 2 + k) = field.viscosity(j, y_max + 1 - k); + viscosity[j + (y_max + 2 + k) * base_stride] = viscosity[j + (y_max + 1 - k) * base_stride]; } } @@ -350,10 +378,11 @@ void 
update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.viscosity(1 - j, k) = field.viscosity(2 + j, k); + viscosity[(1 - j) + (k) * base_stride] = viscosity[(2 + j) + (k) * base_stride]; } } @@ -363,10 +392,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = field.viscosity.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.viscosity(x_max + 2 + j, k) = field.viscosity(x_max + 1 - j, k); + viscosity[(x_max + 2 + j) + (k) * base_stride] = viscosity[(x_max + 1 - j) + (k) * base_stride]; } } @@ -379,10 +409,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.soundspeed(j, 1 - k) = field.soundspeed(j, +k); + soundspeed[j + (1 - k) * base_stride] = soundspeed[j + (+k) * base_stride]; } } @@ -392,10 +423,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.soundspeed(j, y_max + 2 + k) = field.soundspeed(j, y_max + 1 - k); + soundspeed[j + (y_max + 2 + k) * base_stride] = soundspeed[j + (y_max + 1 - k) * base_stride]; } } @@ -405,10 +437,11 @@ void update_halo_kernel( // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.soundspeed(1 - j, k) = field.soundspeed(2 + j, k); + soundspeed[(1 - j) + (k) * base_stride] = soundspeed[(2 + j) + (k) * base_stride]; } } @@ -418,10 +451,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = field.soundspeed.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.soundspeed(x_max + 2 + j, k) = field.soundspeed(x_max + 1 - j, k); + soundspeed[(x_max + 2 + j) + (k) * base_stride] = soundspeed[(x_max + 1 - j) + (k) * base_stride]; } } @@ -430,17 +464,18 @@ void update_halo_kernel( if (fields[field_xvel0] == 1) { + + if ((chunk_neighbours[chunk_bottom] == external_face) && (tile_neighbours[tile_bottom] == external_tile)) { // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel0(j, 1 - k) = field.xvel0(j, - 1 + 2 + - k); + xvel0[j + (1 - k) * vels_wk_stride] = xvel0[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -450,10 +485,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel0(j, y_max + 1 + 2 + k) = field.xvel0(j, y_max + 1 - k); + xvel0[j + (y_max + 1 + 2 + k) * 
vels_wk_stride] = xvel0[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -463,10 +499,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel0(1 - j, k) = -field.xvel0(1 + 2 + j, k); + xvel0[(1 - j) + (k) * vels_wk_stride] = -xvel0[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -476,10 +513,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = field.xvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel0(x_max + 2 + 1 + j, k) = -field.xvel0(x_max + 1 - j, k); + xvel0[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = -xvel0[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -492,10 +530,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel1(j, 1 - k) = field.xvel1(j, 1 + 2 + k); + xvel1[j + (1 - k) * vels_wk_stride] = xvel1[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -505,10 +544,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.xvel1(j, y_max + 1 + 2 + k) = field.xvel1(j, y_max + 1 - k); + xvel1[j + (y_max + 1 + 2 + k) * vels_wk_stride] = xvel1[j + (y_max + 1 - k) * 
vels_wk_stride]; } } @@ -518,10 +558,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel1(1 - j, k) = -field.xvel1(1 + 2 + j, k); + xvel1[(1 - j) + (k) * vels_wk_stride] = -xvel1[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -531,10 +572,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = field.xvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.xvel1(x_max + 2 + 1 + j, k) = -field.xvel1(x_max + 1 - j, k); + xvel1[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = -xvel1[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -547,10 +589,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel0(j, 1 - k) = -field.yvel0(j, 1 + 2 + k); + yvel0[j + (1 - k) * vels_wk_stride] = -yvel0[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -560,10 +603,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel0(j, y_max + 1 + 2 + k) = -field.yvel0(j, y_max + 1 - k); + yvel0[j + (y_max + 1 + 2 + k) * vels_wk_stride] = -yvel0[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -573,10 +617,11 @@ void 
update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel0(1 - j, k) = field.yvel0(1 + 2 + j, k); + yvel0[(1 - j) + (k) * vels_wk_stride] = yvel0[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -586,10 +631,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = field.yvel0.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel0(x_max + 2 + 1 + j, k) = field.yvel0(x_max + 1 - j, k); + yvel0[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = yvel0[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -602,10 +648,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel1(j, 1 - k) = -field.yvel1(j, 1 + 2 + k); + yvel1[j + (1 - k) * vels_wk_stride] = -yvel1[j + (1 + 2 + k) * vels_wk_stride]; } } @@ -615,10 +662,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.yvel1(j, y_max + 1 + 2 + k) = -field.yvel1(j, y_max + 1 - k); + yvel1[j + (y_max + 1 + 2 + k) * vels_wk_stride] = -yvel1[j + (y_max + 1 - k) * vels_wk_stride]; } } @@ -628,10 +676,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - 
_Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel1(1 - j, k) = field.yvel1(1 + 2 + j, k); + yvel1[(1 - j) + (k) * vels_wk_stride] = yvel1[(1 + 2 + j) + (k) * vels_wk_stride]; } } @@ -641,10 +690,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = field.yvel1.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.yvel1(x_max + 2 + 1 + j, k) = field.yvel1(x_max + 1 - j, k); + yvel1[(x_max + 2 + 1 + j) + (k) * vels_wk_stride] = yvel1[(x_max + 1 - j) + (k) * vels_wk_stride]; } } @@ -658,10 +708,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_x(j, 1 - k) = field.vol_flux_x(j, 1 + 2 + k); + vol_flux_x[j + (1 - k) * flux_x_stride] = vol_flux_x[j + (1 + 2 + k) * flux_x_stride]; } } @@ -671,10 +722,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_x(j, y_max + 2 + k) = field.vol_flux_x(j, y_max - k); + vol_flux_x[j + (y_max + 2 + k) * flux_x_stride] = vol_flux_x[j + (y_max - k) * flux_x_stride]; } } @@ -684,10 +736,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + 
double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_x(1 - j, k) = -field.vol_flux_x(1 + 2 + j, k); + vol_flux_x[(1 - j) + (k) * flux_x_stride] = -vol_flux_x[(1 + 2 + j) + (k) * flux_x_stride]; } } @@ -697,10 +750,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = field.vol_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_x(x_max + j + 1 + 2, k) = -field.vol_flux_x(x_max + 1 - j, k); + vol_flux_x[(x_max + j + 1 + 2) + (k) * flux_x_stride] = -vol_flux_x[(x_max + 1 - j) + (k) * flux_x_stride]; } } @@ -714,10 +768,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_x(j, 1 - k) = field.mass_flux_x(j, 1 + 2 + k); + mass_flux_x[j + (1 - k) * flux_x_stride] = mass_flux_x[j + (1 + 2 + k) * flux_x_stride]; } } @@ -727,10 +782,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_x(j, y_max + 2 + k) = field.mass_flux_x(j, y_max - k); + mass_flux_x[j + (y_max + 2 + k) * flux_x_stride] = mass_flux_x[j + (y_max - k) * flux_x_stride]; } } @@ -740,10 +796,11 @@ void update_halo_kernel( // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_x(1 - j, k) = -field.mass_flux_x(1 + 2 + j, k); + mass_flux_x[(1 - j) + (k) * flux_x_stride] = -mass_flux_x[(1 + 2 + j) + (k) * flux_x_stride]; } } @@ -753,10 +810,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = field.mass_flux_x.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_x(x_max + j + 1 + 2, k) = -field.mass_flux_x(x_max + 1 - j, k); + mass_flux_x[(x_max + j + 1 + 2) + (k) * flux_x_stride] = -mass_flux_x[(x_max + 1 - j) + (k) * flux_x_stride]; } } @@ -770,10 +828,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_y(j, 1 - k) = -field.vol_flux_y(j, 1 + 2 + k); + vol_flux_y[j + (1 - k) * flux_y_stride] = -vol_flux_y[j + (1 + 2 + k) * flux_y_stride]; } } @@ -783,10 +842,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.vol_flux_y(j, y_max + k + 1 + 2) = -field.vol_flux_y(j, y_max + 1 - k); + vol_flux_y[j + (y_max + k + 1 + 2) * flux_y_stride] = -vol_flux_y[j + (y_max + 1 - k) * flux_y_stride]; } } @@ 
-796,10 +856,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_y(1 - j, k) = field.vol_flux_y(1 + 2 + j, k); + vol_flux_y[(1 - j) + (k) * flux_y_stride] = vol_flux_y[(1 + 2 + j) + (k) * flux_y_stride]; } } @@ -809,10 +870,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = field.vol_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.vol_flux_y(x_max + 2 + j, k) = field.vol_flux_y(x_max - j, k); + vol_flux_y[(x_max + 2 + j) + (k) * flux_y_stride] = vol_flux_y[(x_max - j) + (k) * flux_y_stride]; } } @@ -825,10 +887,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_y(j, 1 - k) = -field.mass_flux_y(j, 1 + 2 + k); + mass_flux_y[j + (1 - k) * flux_y_stride] = -mass_flux_y[j + (1 + 2 + k) * flux_y_stride]; } } @@ -838,10 +901,11 @@ void update_halo_kernel( // DO j=x_min-depth,x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { for (int k = 0; k < depth; ++k) { - field.mass_flux_y(j, y_max + k + 1 + 2) = -field.mass_flux_y(j, y_max + 1 - k); + mass_flux_y[j + (y_max + k + 1 + 2) * flux_y_stride] = -mass_flux_y[j + 
(y_max + 1 - k) * flux_y_stride]; } } @@ -851,10 +915,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_y(1 - j, k) = field.mass_flux_y(1 + 2 + j, k); + mass_flux_y[(1 - j) + (k) * flux_y_stride] = mass_flux_y[(1 + 2 + j) + (k) * flux_y_stride]; } } @@ -864,10 +929,11 @@ void update_halo_kernel( // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = field.mass_flux_y.data; + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - field.mass_flux_y(x_max + 2 + j, k) = field.mass_flux_y(x_max - j, k); + mass_flux_y[(x_max + 2 + j) + (k) * flux_y_stride] = mass_flux_y[(x_max - j) + (k) * flux_y_stride]; } } @@ -903,10 +969,15 @@ void update_halo(global_variables &globals, int fields[NUM_FIELDS], int depth) { (globals.chunk.chunk_neighbours[chunk_bottom] == external_face) || (globals.chunk.chunk_neighbours[chunk_top] == external_face)) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; update_halo_kernel( + globals.use_target, t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, @@ -918,6 +989,11 @@ void update_halo(global_variables &globals, int fields[NUM_FIELDS], int depth) { depth); } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + + } diff --git a/src/update_tile_halo.cpp b/src/update_tile_halo.cpp index 9bc9d25..1c54243 100644 --- a/src/update_tile_halo.cpp +++ b/src/update_tile_halo.cpp @@ -30,6 +30,10 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep // Update Top 
Bottom - Real to Real + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &tt = globals.chunk.tiles[tile]; @@ -39,6 +43,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_up != external_tile) { tile_type &tup = globals.chunk.tiles[t_up]; update_tile_halo_t_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -85,6 +90,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_down != external_tile) { tile_type &tdown = globals.chunk.tiles[t_down]; update_tile_halo_b_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -139,6 +145,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_left != external_tile) { tile_type &tleft = globals.chunk.tiles[t_left]; update_tile_halo_l_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -184,6 +191,7 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep if (t_right != external_tile) { tile_type &tright = globals.chunk.tiles[t_right]; update_tile_halo_r_kernel( + globals.use_target, tt.info.t_xmin, tt.info.t_xmax, tt.info.t_ymin, @@ -227,5 +235,9 @@ void update_tile_halo(global_variables &globals, int fields[NUM_FIELDS], int dep } } + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/update_tile_halo_kernel.cpp b/src/update_tile_halo_kernel.cpp index de6f8dc..c57daa0 100644 --- a/src/update_tile_halo_kernel.cpp +++ b/src/update_tile_halo_kernel.cpp @@ -18,7 +18,7 @@ */ #include "update_tile_halo_kernel.h" -#include "utils.hpp" + // @brief Fortran kernel to update the external halo cells in a chunk. // @author Wayne Gaudin @@ -28,40 +28,45 @@ // reflective. 
void update_tile_halo_l_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int left_xmin, int left_xmax, - int left_ymin, int left_ymax, clover::Buffer2D &left_density0, - clover::Buffer2D &left_energy0, - clover::Buffer2D &left_pressure, - clover::Buffer2D &left_viscosity, - clover::Buffer2D &left_soundspeed, - clover::Buffer2D &left_density1, - clover::Buffer2D &left_energy1, - clover::Buffer2D &left_xvel0, - clover::Buffer2D &left_yvel0, - clover::Buffer2D &left_xvel1, - clover::Buffer2D &left_yvel1, - clover::Buffer2D &left_vol_flux_x, - clover::Buffer2D &left_vol_flux_y, - clover::Buffer2D &left_mass_flux_x, - clover::Buffer2D &left_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int left_xmin, int left_xmax, + int left_ymin, int left_ymax, clover::Buffer2D &left_density0_buffer, + clover::Buffer2D &left_energy0_buffer, + clover::Buffer2D &left_pressure_buffer, + clover::Buffer2D &left_viscosity_buffer, + clover::Buffer2D &left_soundspeed_buffer, + clover::Buffer2D 
&left_density1_buffer, + clover::Buffer2D &left_energy1_buffer, + clover::Buffer2D &left_xvel0_buffer, + clover::Buffer2D &left_yvel0_buffer, + clover::Buffer2D &left_xvel1_buffer, + clover::Buffer2D &left_yvel1_buffer, + clover::Buffer2D &left_vol_flux_x_buffer, + clover::Buffer2D &left_vol_flux_y_buffer, + clover::Buffer2D &left_mass_flux_x_buffer, + clover::Buffer2D &left_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *left_density0 = left_density0_buffer.data; + const int left_density0_sizex = left_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density0(x_min - j, k) = left_density0(left_xmax + 1 - j, k); + density0[(x_min - j) + (k) * density0_sizex] = left_density0[(left_xmax + 1 - j) + (k) * left_density0_sizex]; } } } @@ -70,10 +75,14 @@ void update_tile_halo_l_kernel( if (fields[field_density1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *left_density1 = left_density1_buffer.data; + const int left_density1_sizex = left_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density1(x_min - j, k) = left_density1(left_xmax + 1 - j, k); + density1[(x_min - j) + (k) * density1_sizex] = left_density1[(left_xmax + 1 - j) + (k) * left_density1_sizex]; } } } @@ -82,10 +91,14 @@ void update_tile_halo_l_kernel( if (fields[field_energy0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = 
energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *left_energy0 = left_energy0_buffer.data; + const int left_energy0_sizex = left_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy0(x_min - j, k) = left_energy0(left_xmax + 1 - j, k); + energy0[(x_min - j) + (k) * energy0_sizex] = left_energy0[(left_xmax + 1 - j) + (k) * left_energy0_sizex]; } } } @@ -94,10 +107,14 @@ void update_tile_halo_l_kernel( if (fields[field_energy1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *left_energy1 = left_energy1_buffer.data; + const int left_energy1_sizex = left_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy1(x_min - j, k) = left_energy1(left_xmax + 1 - j, k); + energy1[(x_min - j) + (k) * energy1_sizex] = left_energy1[(left_xmax + 1 - j) + (k) * left_energy1_sizex]; } } } @@ -106,10 +123,14 @@ void update_tile_halo_l_kernel( if (fields[field_pressure] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *left_pressure = left_pressure_buffer.data; + const int left_pressure_sizex = left_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - pressure(x_min - j, k) = left_pressure(left_xmax + 1 - j, k); + pressure[(x_min - j) + (k) * pressure_sizex] = left_pressure[(left_xmax + 1 - j) + (k) * left_pressure_sizex]; } } } @@ -118,10 +139,14 @@ void 
update_tile_halo_l_kernel( if (fields[field_viscosity] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *left_viscosity = left_viscosity_buffer.data; + const int left_viscosity_sizex = left_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - viscosity(x_min - j, k) = left_viscosity(left_xmax + 1 - j, k); + viscosity[(x_min - j) + (k) * viscosity_sizex] = left_viscosity[(left_xmax + 1 - j) + (k) * left_viscosity_sizex]; } } } @@ -130,10 +155,14 @@ void update_tile_halo_l_kernel( if (fields[field_soundspeed] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *left_soundspeed = left_soundspeed_buffer.data; + const int left_soundspeed_sizex = left_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - soundspeed(x_min - j, k) = left_soundspeed(left_xmax + 1 - j, k); + soundspeed[(x_min - j) + (k) * soundspeed_sizex] = left_soundspeed[(left_xmax + 1 - j) + (k) * left_soundspeed_sizex]; } } } @@ -142,10 +171,14 @@ void update_tile_halo_l_kernel( if (fields[field_xvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *left_xvel0 = left_xvel0_buffer.data; + const int left_xvel0_sizex = left_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel0(x_min - j, k) = 
left_xvel0(left_xmax + 1 - j, k); + xvel0[(x_min - j) + (k) * xvel0_sizex] = left_xvel0[(left_xmax + 1 - j) + (k) * left_xvel0_sizex]; } } } @@ -154,10 +187,14 @@ void update_tile_halo_l_kernel( if (fields[field_xvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *left_xvel1 = left_xvel1_buffer.data; + const int left_xvel1_sizex = left_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel1(x_min - j, k) = left_xvel1(left_xmax + 1 - j, k); + xvel1[(x_min - j) + (k) * xvel1_sizex] = left_xvel1[(left_xmax + 1 - j) + (k) * left_xvel1_sizex]; } } } @@ -166,10 +203,14 @@ void update_tile_halo_l_kernel( if (fields[field_yvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *left_yvel0 = left_yvel0_buffer.data; + const int left_yvel0_sizex = left_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel0(x_min - j, k) = left_yvel0(left_xmax + 1 - j, k); + yvel0[(x_min - j) + (k) * yvel0_sizex] = left_yvel0[(left_xmax + 1 - j) + (k) * left_yvel0_sizex]; } } } @@ -178,10 +219,14 @@ void update_tile_halo_l_kernel( if (fields[field_yvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *left_yvel1 = left_yvel1_buffer.data; + const int left_yvel1_sizex = left_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 
0; j < depth; ++j) { - yvel1(x_min - j, k) = left_yvel1(left_xmax + 1 - j, k); + yvel1[(x_min - j) + (k) * yvel1_sizex] = left_yvel1[(left_xmax + 1 - j) + (k) * left_yvel1_sizex]; } } } @@ -190,10 +235,14 @@ void update_tile_halo_l_kernel( if (fields[field_vol_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *left_vol_flux_x = left_vol_flux_x_buffer.data; + const int left_vol_flux_x_sizex = left_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_x(x_min - j, k) = left_vol_flux_x(left_xmax + 1 - j, k); + vol_flux_x[(x_min - j) + (k) * vol_flux_x_sizex] = left_vol_flux_x[(left_xmax + 1 - j) + (k) * left_vol_flux_x_sizex]; } } } @@ -202,10 +251,14 @@ void update_tile_halo_l_kernel( if (fields[field_mass_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *left_mass_flux_x = left_mass_flux_x_buffer.data; + const int left_mass_flux_x_sizex = left_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_x(x_min - j, k) = left_mass_flux_x(left_xmax + 1 - j, k); + mass_flux_x[(x_min - j) + (k) * mass_flux_x_sizex] = left_mass_flux_x[(left_xmax + 1 - j) + (k) * left_mass_flux_x_sizex]; } } } @@ -214,10 +267,14 @@ void update_tile_halo_l_kernel( if (fields[field_vol_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *left_vol_flux_y = 
left_vol_flux_y_buffer.data; + const int left_vol_flux_y_sizex = left_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_y(x_min - j, k) = left_vol_flux_y(left_xmax + 1 - j, k); + vol_flux_y[(x_min - j) + (k) * vol_flux_y_sizex] = left_vol_flux_y[(left_xmax + 1 - j) + (k) * left_vol_flux_y_sizex]; } } } @@ -226,50 +283,59 @@ void update_tile_halo_l_kernel( if (fields[field_mass_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *left_mass_flux_y = left_mass_flux_y_buffer.data; + const int left_mass_flux_y_sizex = left_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_y(x_min - j, k) = left_mass_flux_y(left_xmax + 1 - j, k); + mass_flux_y[(x_min - j) + (k) * mass_flux_y_sizex] = left_mass_flux_y[(left_xmax + 1 - j) + (k) * left_mass_flux_y_sizex]; } } } } void update_tile_halo_r_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int right_xmin, int right_xmax, - int right_ymin, int right_ymax, clover::Buffer2D &right_density0, - clover::Buffer2D &right_energy0, - clover::Buffer2D &right_pressure, - clover::Buffer2D &right_viscosity, - 
clover::Buffer2D &right_soundspeed, - clover::Buffer2D &right_density1, - clover::Buffer2D &right_energy1, - clover::Buffer2D &right_xvel0, - clover::Buffer2D &right_yvel0, - clover::Buffer2D &right_xvel1, - clover::Buffer2D &right_yvel1, - clover::Buffer2D &right_vol_flux_x, - clover::Buffer2D &right_vol_flux_y, - clover::Buffer2D &right_mass_flux_x, - clover::Buffer2D &right_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int right_xmin, int right_xmax, + int right_ymin, int right_ymax, clover::Buffer2D &right_density0_buffer, + clover::Buffer2D &right_energy0_buffer, + clover::Buffer2D &right_pressure_buffer, + clover::Buffer2D &right_viscosity_buffer, + clover::Buffer2D &right_soundspeed_buffer, + clover::Buffer2D &right_density1_buffer, + clover::Buffer2D &right_energy1_buffer, + clover::Buffer2D &right_xvel0_buffer, + clover::Buffer2D &right_yvel0_buffer, + clover::Buffer2D &right_xvel1_buffer, + clover::Buffer2D &right_yvel1_buffer, + clover::Buffer2D &right_vol_flux_x_buffer, + clover::Buffer2D &right_vol_flux_y_buffer, + clover::Buffer2D &right_mass_flux_x_buffer, + clover::Buffer2D &right_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *right_density0 = right_density0_buffer.data; + const int right_density0_sizex 
= right_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density0(x_max + 2 + j, k) = right_density0(right_xmin - 1 + 2 + j, k); + density0[(x_max + 2 + j) + (k) * density0_sizex] = right_density0[(right_xmin - 1 + 2 + j) + (k) * right_density0_sizex]; } } } @@ -278,10 +344,14 @@ void update_tile_halo_r_kernel( if (fields[field_density1] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *right_density1 = right_density1_buffer.data; + const int right_density1_sizex = right_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - density1(x_max + 2 + j, k) = right_density1(right_xmin - 1 + 2 + j, k); + density1[(x_max + 2 + j) + (k) * density1_sizex] = right_density1[(right_xmin - 1 + 2 + j) + (k) * right_density1_sizex]; } } } @@ -290,10 +360,14 @@ void update_tile_halo_r_kernel( if (fields[field_energy0] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *right_energy0 = right_energy0_buffer.data; + const int right_energy0_sizex = right_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy0(x_max + 2 + j, k) = right_energy0(right_xmin - 1 + 2 + j, k); + energy0[(x_max + 2 + j) + (k) * energy0_sizex] = right_energy0[(right_xmin - 1 + 2 + j) + (k) * right_energy0_sizex]; } } } @@ -302,10 +376,14 @@ void update_tile_halo_r_kernel( if (fields[field_energy1] == 1) { // DO 
k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *right_energy1 = right_energy1_buffer.data; + const int right_energy1_sizex = right_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - energy1(x_max + 2 + j, k) = right_energy1(right_xmin - 1 + 2 + j, k); + energy1[(x_max + 2 + j) + (k) * energy1_sizex] = right_energy1[(right_xmin - 1 + 2 + j) + (k) * right_energy1_sizex]; } } } @@ -314,10 +392,14 @@ void update_tile_halo_r_kernel( if (fields[field_pressure] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *right_pressure = right_pressure_buffer.data; + const int right_pressure_sizex = right_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - pressure(x_max + 2 + j, k) = right_pressure(right_xmin - 1 + 2 + j, k); + pressure[(x_max + 2 + j) + (k) * pressure_sizex] = right_pressure[(right_xmin - 1 + 2 + j) + (k) * right_pressure_sizex]; } } } @@ -326,10 +408,14 @@ void update_tile_halo_r_kernel( if (fields[field_viscosity] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *right_viscosity = right_viscosity_buffer.data; + const int right_viscosity_sizex = right_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - viscosity(x_max + 2 + j, k) = right_viscosity(right_xmin - 1 + 2 + 
j, k); + viscosity[(x_max + 2 + j) + (k) * viscosity_sizex] = right_viscosity[(right_xmin - 1 + 2 + j) + (k) * right_viscosity_sizex]; } } } @@ -338,10 +424,14 @@ void update_tile_halo_r_kernel( if (fields[field_soundspeed] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *right_soundspeed = right_soundspeed_buffer.data; + const int right_soundspeed_sizex = right_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - soundspeed(x_max + 2 + j, k) = right_soundspeed(right_xmin - 1 + 2 + j, k); + soundspeed[(x_max + 2 + j) + (k) * soundspeed_sizex] = right_soundspeed[(right_xmin - 1 + 2 + j) + (k) * right_soundspeed_sizex]; } } } @@ -350,10 +440,14 @@ void update_tile_halo_r_kernel( if (fields[field_xvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *right_xvel0 = right_xvel0_buffer.data; + const int right_xvel0_sizex = right_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel0(x_max + 1 + 2 + j, k) = right_xvel0(right_xmin + 1 - 1 + 2 + j, k); + xvel0[(x_max + 1 + 2 + j) + (k) * xvel0_sizex] = right_xvel0[(right_xmin + 1 - 1 + 2 + j) + (k) * right_xvel0_sizex]; } } } @@ -362,10 +456,14 @@ void update_tile_halo_r_kernel( if (fields[field_xvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *right_xvel1 = right_xvel1_buffer.data; + const int right_xvel1_sizex = right_xvel1_buffer.nX(); + #pragma omp target teams 
distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - xvel1(x_max + 1 + 2 + j, k) = right_xvel1(right_xmin + 1 - 1 + 2 + j, k); + xvel1[(x_max + 1 + 2 + j) + (k) * xvel1_sizex] = right_xvel1[(right_xmin + 1 - 1 + 2 + j) + (k) * right_xvel1_sizex]; } } } @@ -374,10 +472,14 @@ void update_tile_halo_r_kernel( if (fields[field_yvel0] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *right_yvel0 = right_yvel0_buffer.data; + const int right_yvel0_sizex = right_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel0(x_max + 1 + 2 + j, k) = right_yvel0(right_xmin + 1 - 1 + 2 + j, k); + yvel0[(x_max + 1 + 2 + j) + (k) * yvel0_sizex] = right_yvel0[(right_xmin + 1 - 1 + 2 + j) + (k) * right_yvel0_sizex]; } } } @@ -386,10 +488,14 @@ void update_tile_halo_r_kernel( if (fields[field_yvel1] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *right_yvel1 = right_yvel1_buffer.data; + const int right_yvel1_sizex = right_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - yvel1(x_max + 1 + 2 + j, k) = right_yvel1(right_xmin + 1 - 1 + 2 + j, k); + yvel1[(x_max + 1 + 2 + j) + (k) * yvel1_sizex] = right_yvel1[(right_xmin + 1 - 1 + 2 + j) + (k) * right_yvel1_sizex]; } } } @@ -398,10 +504,14 @@ void update_tile_halo_r_kernel( if (fields[field_vol_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *vol_flux_x = 
vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *right_vol_flux_x = right_vol_flux_x_buffer.data; + const int right_vol_flux_x_sizex = right_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - vol_flux_x(x_max + 1 + 2 + j, k) = right_vol_flux_x(right_xmin + 1 - 1 + 2 + j, k); + vol_flux_x[(x_max + 1 + 2 + j) + (k) * vol_flux_x_sizex] = right_vol_flux_x[(right_xmin + 1 - 1 + 2 + j) + (k) * right_vol_flux_x_sizex]; } } } @@ -410,10 +520,14 @@ void update_tile_halo_r_kernel( if (fields[field_mass_flux_x] == 1) { // DO k=y_min-depth,y_max+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *right_mass_flux_x = right_mass_flux_x_buffer.data; + const int right_mass_flux_x_sizex = right_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_x(x_max + 1 + 2 + j, k) = right_mass_flux_x(right_xmin + 1 - 1 + 2 + j, k); + mass_flux_x[(x_max + 1 + 2 + j) + (k) * mass_flux_x_sizex] = right_mass_flux_x[(right_xmin + 1 - 1 + 2 + j) + (k) * right_mass_flux_x_sizex]; } } } @@ -422,10 +536,14 @@ void update_tile_halo_r_kernel( if (fields[field_vol_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *right_vol_flux_y = right_vol_flux_y_buffer.data; + const int right_vol_flux_y_sizex = right_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - 
vol_flux_y(x_max + 2 + j, k) = right_vol_flux_y(right_xmin - 1 + 2 + j, k); + vol_flux_y[(x_max + 2 + j) + (k) * vol_flux_y_sizex] = right_vol_flux_y[(right_xmin - 1 + 2 + j) + (k) * right_vol_flux_y_sizex]; } } } @@ -434,10 +552,14 @@ void update_tile_halo_r_kernel( if (fields[field_mass_flux_y] == 1) { // DO k=y_min-depth,y_max+1+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *right_mass_flux_y = right_mass_flux_y_buffer.data; + const int right_mass_flux_y_sizex = right_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int k = (y_min - depth + 1); k < (y_max + 1 + depth + 2); k++) { for (int j = 0; j < depth; ++j) { - mass_flux_y(x_max + 2 + j, k) = right_mass_flux_y(right_xmin - 1 + 2 + j, k); + mass_flux_y[(x_max + 2 + j) + (k) * mass_flux_y_sizex] = right_mass_flux_y[(right_xmin - 1 + 2 + j) + (k) * right_mass_flux_y_sizex]; } } } @@ -448,38 +570,43 @@ void update_tile_halo_r_kernel( // communication void update_tile_halo_t_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int top_xmin, int top_xmax, - int top_ymin, int top_ymax, clover::Buffer2D &top_density0, - clover::Buffer2D &top_energy0, - clover::Buffer2D &top_pressure, - clover::Buffer2D &top_viscosity, - clover::Buffer2D &top_soundspeed, - clover::Buffer2D &top_density1, - clover::Buffer2D &top_energy1, - clover::Buffer2D &top_xvel0, clover::Buffer2D &top_yvel0, - clover::Buffer2D &top_xvel1, 
clover::Buffer2D &top_yvel1, - clover::Buffer2D &top_vol_flux_x, - clover::Buffer2D &top_vol_flux_y, - clover::Buffer2D &top_mass_flux_x, - clover::Buffer2D &top_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int top_xmin, int top_xmax, + int top_ymin, int top_ymax, clover::Buffer2D &top_density0_buffer, + clover::Buffer2D &top_energy0_buffer, + clover::Buffer2D &top_pressure_buffer, + clover::Buffer2D &top_viscosity_buffer, + clover::Buffer2D &top_soundspeed_buffer, + clover::Buffer2D &top_density1_buffer, + clover::Buffer2D &top_energy1_buffer, + clover::Buffer2D &top_xvel0_buffer, clover::Buffer2D &top_yvel0_buffer, + clover::Buffer2D &top_xvel1_buffer, clover::Buffer2D &top_yvel1_buffer, + clover::Buffer2D &top_vol_flux_x_buffer, + clover::Buffer2D &top_vol_flux_y_buffer, + clover::Buffer2D &top_mass_flux_x_buffer, + clover::Buffer2D &top_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *top_density0 = top_density0_buffer.data; + const int top_density0_sizex = top_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density0(j, y_max + 2 + k) = top_density0(j, 
top_ymin - 1 + 2 + k); + density0[j + (y_max + 2 + k) * density0_sizex] = top_density0[j + (top_ymin - 1 + 2 + k) * top_density0_sizex]; } } } @@ -489,9 +616,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *top_density1 = top_density1_buffer.data; + const int top_density1_sizex = top_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density1(j, y_max + 2 + k) = top_density1(j, top_ymin - 1 + 2 + k); + density1[j + (y_max + 2 + k) * density1_sizex] = top_density1[j + (top_ymin - 1 + 2 + k) * top_density1_sizex]; } } } @@ -501,9 +632,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + const int energy0_sizex = energy0_buffer.nX(); + double *top_energy0 = top_energy0_buffer.data; + const int top_energy0_sizex = top_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy0(j, y_max + 2 + k) = top_energy0(j, top_ymin - 1 + 2 + k); + energy0[j + (y_max + 2 + k) * energy0_sizex] = top_energy0[j + (top_ymin - 1 + 2 + k) * top_energy0_sizex]; } } } @@ -513,9 +648,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *top_energy1 = top_energy1_buffer.data; + const int top_energy1_sizex = top_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - 
energy1(j, y_max + 2 + k) = top_energy1(j, top_ymin - 1 + 2 + k); + energy1[j + (y_max + 2 + k) * energy1_sizex] = top_energy1[j + (top_ymin - 1 + 2 + k) * top_energy1_sizex]; } } } @@ -525,9 +664,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *top_pressure = top_pressure_buffer.data; + const int top_pressure_sizex = top_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - pressure(j, y_max + 2 + k) = top_pressure(j, top_ymin - 1 + 2 + k); + pressure[j + (y_max + 2 + k) * pressure_sizex] = top_pressure[j + (top_ymin - 1 + 2 + k) * top_pressure_sizex]; } } } @@ -537,9 +680,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *top_viscosity = top_viscosity_buffer.data; + const int top_viscosity_sizex = top_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - viscosity(j, y_max + 2 + k) = top_viscosity(j, top_ymin - 1 + 2 + k); + viscosity[j + (y_max + 2 + k) * viscosity_sizex] = top_viscosity[j + (top_ymin - 1 + 2 + k) * top_viscosity_sizex]; } } } @@ -549,9 +696,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *top_soundspeed = top_soundspeed_buffer.data; + const int top_soundspeed_sizex = top_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd 
clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - soundspeed(j, y_max + 2 + k) = top_soundspeed(j, top_ymin - 1 + 2 + k); + soundspeed[j + (y_max + 2 + k) * soundspeed_sizex] = top_soundspeed[j + (top_ymin - 1 + 2 + k) * top_soundspeed_sizex]; } } } @@ -561,9 +712,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *top_xvel0 = top_xvel0_buffer.data; + const int top_xvel0_sizex = top_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel0(j, y_max + 1 + 2 + k) = top_xvel0(j, top_ymin + 1 - 1 + 2 + k); + xvel0[j + (y_max + 1 + 2 + k) * xvel0_sizex] = top_xvel0[j + (top_ymin + 1 - 1 + 2 + k) * top_xvel0_sizex]; } } } @@ -573,9 +728,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *top_xvel1 = top_xvel1_buffer.data; + const int top_xvel1_sizex = top_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel1(j, y_max + 1 + 2 + k) = top_xvel1(j, top_ymin + 1 - 1 + 2 + k); + xvel1[j + (y_max + 1 + 2 + k) * xvel1_sizex] = top_xvel1[j + (top_ymin + 1 - 1 + 2 + k) * top_xvel1_sizex]; } } } @@ -585,9 +744,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *top_yvel0 = top_yvel0_buffer.data; + const int top_yvel0_sizex = top_yvel0_buffer.nX(); + #pragma omp target teams distribute 
parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel0(j, y_max + 1 + 2 + k) = top_yvel0(j, top_ymin + 1 - 1 + 2 + k); + yvel0[j + (y_max + 1 + 2 + k) * yvel0_sizex] = top_yvel0[j + (top_ymin + 1 - 1 + 2 + k) * top_yvel0_sizex]; } } } @@ -597,9 +760,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *top_yvel1 = top_yvel1_buffer.data; + const int top_yvel1_sizex = top_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel1(j, y_max + 1 + 2 + k) = top_yvel1(j, top_ymin + 1 - 1 + 2 + k); + yvel1[j + (y_max + 1 + 2 + k) * yvel1_sizex] = top_yvel1[j + (top_ymin + 1 - 1 + 2 + k) * top_yvel1_sizex]; } } } @@ -609,9 +776,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *top_vol_flux_x = top_vol_flux_x_buffer.data; + const int top_vol_flux_x_sizex = top_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - vol_flux_x(j, y_max + 2 + k) = top_vol_flux_x(j, top_ymin - 1 + 2 + k); + vol_flux_x[j + (y_max + 2 + k) * vol_flux_x_sizex] = top_vol_flux_x[j + (top_ymin - 1 + 2 + k) * top_vol_flux_x_sizex]; } } } @@ -621,9 +792,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *top_mass_flux_x = 
top_mass_flux_x_buffer.data; + const int top_mass_flux_x_sizex = top_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - mass_flux_x(j, y_max + 2 + k) = top_mass_flux_x(j, top_ymin - 1 + 2 + k); + mass_flux_x[j + (y_max + 2 + k) * mass_flux_x_sizex] = top_mass_flux_x[j + (top_ymin - 1 + 2 + k) * top_mass_flux_x_sizex]; } } } @@ -633,9 +808,13 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *top_vol_flux_y = top_vol_flux_y_buffer.data; + const int top_vol_flux_y_sizex = top_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - vol_flux_y(j, y_max + 1 + 2 + k) = top_vol_flux_y(j, top_ymin + 1 - 1 + 2 + k); + vol_flux_y[j + (y_max + 1 + 2 + k) * vol_flux_y_sizex] = top_vol_flux_y[j + (top_ymin + 1 - 1 + 2 + k) * top_vol_flux_y_sizex]; } } } @@ -645,50 +824,59 @@ void update_tile_halo_t_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *top_mass_flux_y = top_mass_flux_y_buffer.data; + const int top_mass_flux_y_sizex = top_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - mass_flux_y(j, y_max + 1 + 2 + k) = top_mass_flux_y(j, top_ymin + 1 - 1 + 2 + k); + mass_flux_y[j + (y_max + 1 + 2 + k) * mass_flux_y_sizex] = top_mass_flux_y[j + (top_ymin + 1 - 1 + 2 + k) * top_mass_flux_y_sizex]; } } } } void update_tile_halo_b_kernel( + bool use_target, int 
x_min, int x_max, int y_min, int y_max, - clover::Buffer2D &density0, clover::Buffer2D &energy0, - clover::Buffer2D &pressure, clover::Buffer2D &viscosity, - clover::Buffer2D &soundspeed, clover::Buffer2D &density1, - clover::Buffer2D &energy1, clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0, clover::Buffer2D &xvel1, - clover::Buffer2D &yvel1, clover::Buffer2D &vol_flux_x, - clover::Buffer2D &vol_flux_y, - clover::Buffer2D &mass_flux_x, - clover::Buffer2D &mass_flux_y, int bottom_xmin, int bottom_xmax, + clover::Buffer2D &density0_buffer, clover::Buffer2D &energy0_buffer, + clover::Buffer2D &pressure_buffer, clover::Buffer2D &viscosity_buffer, + clover::Buffer2D &soundspeed_buffer, clover::Buffer2D &density1_buffer, + clover::Buffer2D &energy1_buffer, clover::Buffer2D &xvel0_buffer, + clover::Buffer2D &yvel0_buffer, clover::Buffer2D &xvel1_buffer, + clover::Buffer2D &yvel1_buffer, clover::Buffer2D &vol_flux_x_buffer, + clover::Buffer2D &vol_flux_y_buffer, + clover::Buffer2D &mass_flux_x_buffer, + clover::Buffer2D &mass_flux_y_buffer, int bottom_xmin, int bottom_xmax, int bottom_ymin, int bottom_ymax, - clover::Buffer2D &bottom_density0, - clover::Buffer2D &bottom_energy0, - clover::Buffer2D &bottom_pressure, - clover::Buffer2D &bottom_viscosity, - clover::Buffer2D &bottom_soundspeed, - clover::Buffer2D &bottom_density1, - clover::Buffer2D &bottom_energy1, - clover::Buffer2D &bottom_xvel0, - clover::Buffer2D &bottom_yvel0, - clover::Buffer2D &bottom_xvel1, - clover::Buffer2D &bottom_yvel1, - clover::Buffer2D &bottom_vol_flux_x, - clover::Buffer2D &bottom_vol_flux_y, - clover::Buffer2D &bottom_mass_flux_x, - clover::Buffer2D &bottom_mass_flux_y, const int fields[NUM_FIELDS], + clover::Buffer2D &bottom_density0_buffer, + clover::Buffer2D &bottom_energy0_buffer, + clover::Buffer2D &bottom_pressure_buffer, + clover::Buffer2D &bottom_viscosity_buffer, + clover::Buffer2D &bottom_soundspeed_buffer, + clover::Buffer2D &bottom_density1_buffer, + clover::Buffer2D 
&bottom_energy1_buffer, + clover::Buffer2D &bottom_xvel0_buffer, + clover::Buffer2D &bottom_yvel0_buffer, + clover::Buffer2D &bottom_xvel1_buffer, + clover::Buffer2D &bottom_yvel1_buffer, + clover::Buffer2D &bottom_vol_flux_x_buffer, + clover::Buffer2D &bottom_vol_flux_y_buffer, + clover::Buffer2D &bottom_mass_flux_x_buffer, + clover::Buffer2D &bottom_mass_flux_y_buffer, const int fields[NUM_FIELDS], int depth) { // Density 0 if (fields[field_density0] == 1) { for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density0 = density0_buffer.data; + const int density0_sizex = density0_buffer.nX(); + double *bottom_density0 = bottom_density0_buffer.data; + const int bottom_density0_sizex = bottom_density0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density0(j, y_min - k) = bottom_density0(j, bottom_ymax + 1 - k); + density0[j + (y_min - k) * density0_sizex] = bottom_density0[j + (bottom_ymax + 1 - k) * bottom_density0_sizex]; } } } @@ -698,9 +886,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *density1 = density1_buffer.data; + const int density1_sizex = density1_buffer.nX(); + double *bottom_density1 = bottom_density1_buffer.data; + const int bottom_density1_sizex = bottom_density1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - density1(j, y_min - k) = bottom_density1(j, bottom_ymax + 1 - k); + density1[j + (y_min - k) * density1_sizex] = bottom_density1[j + (bottom_ymax + 1 - k) * bottom_density1_sizex]; } } } @@ -710,9 +902,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy0 = energy0_buffer.data; + 
const int energy0_sizex = energy0_buffer.nX(); + double *bottom_energy0 = bottom_energy0_buffer.data; + const int bottom_energy0_sizex = bottom_energy0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy0(j, y_min - k) = bottom_energy0(j, bottom_ymax + 1 - k); + energy0[j + (y_min - k) * energy0_sizex] = bottom_energy0[j + (bottom_ymax + 1 - k) * bottom_energy0_sizex]; } } } @@ -722,9 +918,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *energy1 = energy1_buffer.data; + const int energy1_sizex = energy1_buffer.nX(); + double *bottom_energy1 = bottom_energy1_buffer.data; + const int bottom_energy1_sizex = bottom_energy1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - energy1(j, y_min - k) = bottom_energy1(j, bottom_ymax + 1 - k); + energy1[j + (y_min - k) * energy1_sizex] = bottom_energy1[j + (bottom_ymax + 1 - k) * bottom_energy1_sizex]; } } } @@ -734,9 +934,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *pressure = pressure_buffer.data; + const int pressure_sizex = pressure_buffer.nX(); + double *bottom_pressure = bottom_pressure_buffer.data; + const int bottom_pressure_sizex = bottom_pressure_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - pressure(j, y_min - k) = bottom_pressure(j, bottom_ymax + 1 - k); + pressure[j + (y_min - k) * pressure_sizex] = bottom_pressure[j + (bottom_ymax + 1 - k) * bottom_pressure_sizex]; } } } @@ -746,9 +950,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, 
x_max+depth - _Pragma("kernel1d") + double *viscosity = viscosity_buffer.data; + const int viscosity_sizex = viscosity_buffer.nX(); + double *bottom_viscosity = bottom_viscosity_buffer.data; + const int bottom_viscosity_sizex = bottom_viscosity_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - viscosity(j, y_min - k) = bottom_viscosity(j, bottom_ymax + 1 - k); + viscosity[j + (y_min - k) * viscosity_sizex] = bottom_viscosity[j + (bottom_ymax + 1 - k) * bottom_viscosity_sizex]; } } } @@ -758,9 +966,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *soundspeed = soundspeed_buffer.data; + const int soundspeed_sizex = soundspeed_buffer.nX(); + double *bottom_soundspeed = bottom_soundspeed_buffer.data; + const int bottom_soundspeed_sizex = bottom_soundspeed_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - soundspeed(j, y_min - k) = bottom_soundspeed(j, bottom_ymax + 1 - k); + soundspeed[j + (y_min - k) * soundspeed_sizex] = bottom_soundspeed[j + (bottom_ymax + 1 - k) * bottom_soundspeed_sizex]; } } } @@ -770,9 +982,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel0 = xvel0_buffer.data; + const int xvel0_sizex = xvel0_buffer.nX(); + double *bottom_xvel0 = bottom_xvel0_buffer.data; + const int bottom_xvel0_sizex = bottom_xvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel0(j, y_min - k) = bottom_xvel0(j, bottom_ymax + 1 - k); + xvel0[j + (y_min - k) * xvel0_sizex] = bottom_xvel0[j + (bottom_ymax + 1 - k) * bottom_xvel0_sizex]; } } } @@ 
-782,9 +998,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *xvel1 = xvel1_buffer.data; + const int xvel1_sizex = xvel1_buffer.nX(); + double *bottom_xvel1 = bottom_xvel1_buffer.data; + const int bottom_xvel1_sizex = bottom_xvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - xvel1(j, y_min - k) = bottom_xvel1(j, bottom_ymax + 1 - k); + xvel1[j + (y_min - k) * xvel1_sizex] = bottom_xvel1[j + (bottom_ymax + 1 - k) * bottom_xvel1_sizex]; } } } @@ -794,9 +1014,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel0 = yvel0_buffer.data; + const int yvel0_sizex = yvel0_buffer.nX(); + double *bottom_yvel0 = bottom_yvel0_buffer.data; + const int bottom_yvel0_sizex = bottom_yvel0_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel0(j, y_min - k) = bottom_yvel0(j, bottom_ymax + 1 - k); + yvel0[j + (y_min - k) * yvel0_sizex] = bottom_yvel0[j + (bottom_ymax + 1 - k) * bottom_yvel0_sizex]; } } } @@ -806,9 +1030,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *yvel1 = yvel1_buffer.data; + const int yvel1_sizex = yvel1_buffer.nX(); + double *bottom_yvel1 = bottom_yvel1_buffer.data; + const int bottom_yvel1_sizex = bottom_yvel1_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - yvel1(j, y_min - k) = bottom_yvel1(j, bottom_ymax + 1 - k); + yvel1[j + (y_min - k) * yvel1_sizex] = bottom_yvel1[j + (bottom_ymax + 1 - k) * bottom_yvel1_sizex]; } } } @@ -818,9 
+1046,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *vol_flux_x = vol_flux_x_buffer.data; + const int vol_flux_x_sizex = vol_flux_x_buffer.nX(); + double *bottom_vol_flux_x = bottom_vol_flux_x_buffer.data; + const int bottom_vol_flux_x_sizex = bottom_vol_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - vol_flux_x(j, y_min - k) = bottom_vol_flux_x(j, bottom_ymax + 1 - k); + vol_flux_x[j + (y_min - k) * vol_flux_x_sizex] = bottom_vol_flux_x[j + (bottom_ymax + 1 - k) * bottom_vol_flux_x_sizex]; } } } @@ -830,9 +1062,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+1+depth - _Pragma("kernel1d") + double *mass_flux_x = mass_flux_x_buffer.data; + const int mass_flux_x_sizex = mass_flux_x_buffer.nX(); + double *bottom_mass_flux_x = bottom_mass_flux_x_buffer.data; + const int bottom_mass_flux_x_sizex = bottom_mass_flux_x_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + 1 + depth + 2); j++) { - mass_flux_x(j, y_min - k) = bottom_mass_flux_x(j, bottom_ymax + 1 - k); + mass_flux_x[j + (y_min - k) * mass_flux_x_sizex] = bottom_mass_flux_x[j + (bottom_ymax + 1 - k) * bottom_mass_flux_x_sizex]; } } } @@ -842,9 +1078,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *vol_flux_y = vol_flux_y_buffer.data; + const int vol_flux_y_sizex = vol_flux_y_buffer.nX(); + double *bottom_vol_flux_y = bottom_vol_flux_y_buffer.data; + const int bottom_vol_flux_y_sizex = bottom_vol_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { 
- vol_flux_y(j, y_min - k) = bottom_vol_flux_y(j, bottom_ymax + 1 - k); + vol_flux_y[j + (y_min - k) * vol_flux_y_sizex] = bottom_vol_flux_y[j + (bottom_ymax + 1 - k) * bottom_vol_flux_y_sizex]; } } } @@ -854,9 +1094,13 @@ void update_tile_halo_b_kernel( for (int k = 0; k < depth; ++k) { // DO j=x_min-depth, x_max+depth - _Pragma("kernel1d") + double *mass_flux_y = mass_flux_y_buffer.data; + const int mass_flux_y_sizex = mass_flux_y_buffer.nX(); + double *bottom_mass_flux_y = bottom_mass_flux_y_buffer.data; + const int bottom_mass_flux_y_sizex = bottom_mass_flux_y_buffer.nX(); + #pragma omp target teams distribute parallel for simd clover_use_target(use_target) for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { - mass_flux_y(j, y_min - k) = bottom_mass_flux_y(j, bottom_ymax + 1 - k); + mass_flux_y[j + (y_min - k) * mass_flux_y_sizex] = bottom_mass_flux_y[j + (bottom_ymax + 1 - k) * bottom_mass_flux_y_sizex]; } } } diff --git a/src/update_tile_halo_kernel.h b/src/update_tile_halo_kernel.h index 3c3d305..897b184 100644 --- a/src/update_tile_halo_kernel.h +++ b/src/update_tile_halo_kernel.h @@ -22,9 +22,10 @@ #define UPDATE_TILE_HALO_KERNEL_H #include "definitions.h" -#include "utils.hpp" + void update_tile_halo_l_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -62,6 +63,7 @@ void update_tile_halo_l_kernel( void update_tile_halo_r_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -98,6 +100,7 @@ void update_tile_halo_r_kernel( int depth); void update_tile_halo_t_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, @@ -135,6 +138,7 @@ void update_tile_halo_t_kernel( void update_tile_halo_b_kernel( + bool use_target, int x_min, int x_max, int y_min, int y_max, clover::Buffer2D &density0, clover::Buffer2D &energy0, diff --git 
a/src/utils.hpp b/src/utils.hpp deleted file mode 100644 index 7abef27..0000000 --- a/src/utils.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - Crown Copyright 2012 AWE. - - This file is part of CloverLeaf. - - CloverLeaf is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the - Free Software Foundation, either version 3 of the License, or (at your option) - any later version. - - CloverLeaf is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - details. - - You should have received a copy of the GNU General Public License along with - CloverLeaf. If not, see http://www.gnu.org/licenses/. - */ - -#ifndef UTILS_HPP -#define UTILS_HPP - -#include -#include -#include -#include - -namespace clover { - - template - struct Buffer1D { - - std::vector data; - - explicit Buffer1D(size_t size) : data(size) {} - - T operator[](size_t i) const { return data[i]; } - T &operator[](size_t i) { return data[i]; } - - T *actual() { return data.data(); } - - [[nodiscard]] size_t size() const { return data.size(); } - - friend std::ostream &operator<<(std::ostream &os, const Buffer1D &buffer) { - os << "Buffer1D(size: " << buffer.size << ")"; - return os; - } - - }; - - template - struct Buffer2D { - - const size_t sizeX, sizeY; - std::vector data; - - Buffer2D(size_t sizeX, size_t sizeY) : sizeX(sizeX), sizeY(sizeY), data(sizeX * sizeY) {} - - T &operator()(size_t i, size_t j) { return data[i + j * sizeX]; } - T const &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } - - - T *actual() { return data.data(); } - - friend std::ostream &operator<<(std::ostream &os, const Buffer2D &buffer) { - os << "Buffer2D(sizeX: " << buffer.sizeX << " sizeY: " << buffer.sizeY << ")"; - return os; - } - - - }; - - -} - - -using namespace clover; - -#endif 
//UTILS_HPP diff --git a/src/viscosity.cpp b/src/viscosity.cpp index 9f0830c..ab26f49 100644 --- a/src/viscosity.cpp +++ b/src/viscosity.cpp @@ -19,7 +19,6 @@ #include #include "viscosity.h" -#include "utils.hpp" // @brief Fortran viscosity kernel. // @author Wayne Gaudin @@ -27,52 +26,61 @@ // smooth out shock front and prevent oscillations around discontinuities. // Only cells in compression will have a non-zero value. -void viscosity_kernel(int x_min, int x_max, int y_min, int y_max, - clover::Buffer1D &celldx, - clover::Buffer1D &celldy, - clover::Buffer2D &density0, - clover::Buffer2D &pressure, - clover::Buffer2D &viscosity, - clover::Buffer2D &xvel0, - clover::Buffer2D &yvel0) { +void viscosity_kernel( + bool use_target, + int x_min, int x_max, int y_min, int y_max, + field_type &field) { // DO k=y_min,y_max // DO j=x_min,x_max - _Pragma("kernel2d") + + + const int base_stride = field.base_stride; + const int vels_wk_stride = field.vels_wk_stride; + + double *celldx = field.celldx.data; + double *celldy = field.celldy.data; + double *density0 = field.density0.data; + double *pressure = field.pressure.data; + double *viscosity = field.viscosity.data; + double *xvel0 = field.xvel0.data; + double *yvel0 = field.yvel0.data; + + #pragma omp target teams distribute parallel for simd collapse(2) clover_use_target(use_target) for (int j = (y_min + 1); j < (y_max + 2); j++) { for (int i = (x_min + 1); i < (x_max + 2); i++) { - double ugrad = (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1)) - (xvel0(i, j) + xvel0(i + 0, j + 1)); - double vgrad = (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1)) - (yvel0(i, j) + yvel0(i + 1, j + 0)); + double ugrad = (xvel0[(i + 1) + (j + 0) * vels_wk_stride] + xvel0[(i + 1) + (j + 1) * vels_wk_stride]) - (xvel0[i + j * vels_wk_stride] + xvel0[(i + 0) + (j + 1) * vels_wk_stride]); + double vgrad = (yvel0[(i + 0) + (j + 1) * vels_wk_stride] + yvel0[(i + 1) + (j + 1) * vels_wk_stride]) - (yvel0[i + j * vels_wk_stride] + yvel0[(i + 1) + (j + 0) * 
vels_wk_stride]); double div = (celldx[i] * (ugrad) + celldy[j] * (vgrad)); - double strain2 = 0.5 * (xvel0(i + 0, j + 1) + - xvel0(i + 1, j + 1) - - xvel0(i, j) - - xvel0(i + 1, j + 0)) / celldy[j] + - 0.5 * (yvel0(i + 1, j + 0) + - yvel0(i + 1, j + 1) - - yvel0(i, j) - - yvel0(i + 0, j + 1)) / celldx[i]; - double pgradx = (pressure(i + 1, j + 0) - pressure(i - 1, j + 0)) / (celldx[i] + celldx[i + 1]); - double pgrady = (pressure(i + 0, j + 1) - pressure(i + 0, j - 1)) / (celldy[j] + celldy[j + 2]); + double strain2 = 0.5 * (xvel0[(i + 0) + (j + 1) * vels_wk_stride] + + xvel0[(i + 1) + (j + 1) * vels_wk_stride] - + xvel0[i + j * vels_wk_stride] - + xvel0[(i + 1) + (j + 0) * vels_wk_stride]) / celldy[j] + + 0.5 * (yvel0[(i + 1) + (j + 0) * vels_wk_stride] + + yvel0[(i + 1) + (j + 1) * vels_wk_stride] - + yvel0[i + j * vels_wk_stride] - + yvel0[(i + 0) + (j + 1) * vels_wk_stride]) / celldx[i]; + double pgradx = (pressure[(i + 1) + (j + 0) * base_stride] - pressure[(i - 1) + (j + 0) * base_stride]) / (celldx[i] + celldx[i + 1]); + double pgrady = (pressure[(i + 0) + (j + 1) * base_stride] - pressure[(i + 0) + (j - 1) * base_stride]) / (celldy[j] + celldy[j + 2]); double pgradx2 = pgradx * pgradx; double pgrady2 = pgrady * pgrady; double limiter = ((0.5 * (ugrad) / celldx[i]) * pgradx2 + (0.5 * (vgrad) / celldy[j]) * pgrady2 + strain2 * pgradx * pgrady) / - std::fmax(pgradx2 + pgrady2, g_small); - if ((limiter > 0.0) || (div >= 0.0)) { viscosity(i, j) = 0.0; } + fmax(pgradx2 + pgrady2, g_small); + if ((limiter > 0.0) || (div >= 0.0)) { viscosity[i + j * base_stride] = 0.0; } else { double dirx = 1.0; if (pgradx < 0.0)dirx = -1.0; - pgradx = dirx * std::fmax(g_small, std::fabs(pgradx)); + pgradx = dirx * fmax(g_small, fabs(pgradx)); double diry = 1.0; if (pgradx < 0.0)diry = -1.0; - pgrady = diry * std::fmax(g_small, std::fabs(pgrady)); - double pgrad = std::sqrt(pgradx * pgradx + pgrady * pgrady); - double xgrad = std::fabs(celldx[i] * pgrad / pgradx); - double ygrad 
= std::fabs(celldy[j] * pgrad / pgrady); - double grad = std::fmin(xgrad, ygrad); + pgrady = diry * fmax(g_small, fabs(pgrady)); + double pgrad = sqrt(pgradx * pgradx + pgrady * pgrady); + double xgrad = fabs(celldx[i] * pgrad / pgradx); + double ygrad = fabs(celldy[j] * pgrad / pgrady); + double grad = fmin(xgrad, ygrad); double grad2 = grad * grad; - viscosity(i, j) = 2.0 * density0(i, j) * grad2 * limiter * limiter; + viscosity[i + j * base_stride] = 2.0 * density0[i + j * base_stride] * grad2 * limiter * limiter; } } } @@ -84,11 +92,23 @@ void viscosity_kernel(int x_min, int x_max, int y_min, int y_max, // viscosity. void viscosity(global_variables &globals) { + #if SYNC_BUFFERS + globals.hostToDevice(); + #endif + + for (int tile = 0; tile < globals.config.tiles_per_chunk; ++tile) { tile_type &t = globals.chunk.tiles[tile]; - viscosity_kernel(t.info.t_xmin, t.info.t_xmax, t.info.t_ymin, t.info.t_ymax, - t.field.celldx, t.field.celldy, t.field.density0, - t.field.pressure, t.field.viscosity, t.field.xvel0, - t.field.yvel0); + viscosity_kernel(globals.use_target, + t.info.t_xmin, + t.info.t_xmax, + t.info.t_ymin, + t.info.t_ymax, + t.field); } + + #if SYNC_BUFFERS + globals.deviceToHost(); + #endif + } diff --git a/src/visit.cpp b/src/visit.cpp index b7d532d..1786555 100644 --- a/src/visit.cpp +++ b/src/visit.cpp @@ -183,7 +183,7 @@ void visit(global_variables &globals, parallel_ ¶llel) { for (int j = globals.chunk.tiles[tile].info.t_xmin + 1; j <= globals.chunk.tiles[tile].info.t_xmax + 1; ++j) { double temp = (std::fabs(hm_viscosity(j, k)) > 0.00000001) ? hm_viscosity(j, k) - : 0.0; + : 0.0; u << std::scientific << std::setprecision(3) << temp << std::endl; } }