From f5595d379b4f9e2b768c8096d67903722a59b3e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 13:14:17 -0800 Subject: [PATCH 01/80] remove HPX-5, HPX-3 is the only HPX from now on --- travis/build-run-prk.sh | 3 --- travis/install-deps.sh | 9 ++----- travis/install-hpx.sh | 40 +++++++++++++++++++++++++++++ travis/install-hpx3.sh | 57 ----------------------------------------- travis/install-hpx5.sh | 39 ---------------------------- 5 files changed, 42 insertions(+), 106 deletions(-) create mode 100755 travis/install-hpx.sh delete mode 100755 travis/install-hpx3.sh delete mode 100755 travis/install-hpx5.sh diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index c9e98f2f0..14e505265 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -1134,9 +1134,6 @@ case "$PRK_TARGET" in allhpx3) echo "Nothing to do yet" ;; - allhpx5) - echo "Nothing to do yet" - ;; alllegion) echo "Legion" echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..3c4fc29a1 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -169,16 +169,11 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx3) - echo "HPX-3" + allhpx) + echo "HPX" sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-hpx3.sh $TRAVIS_ROOT ;; - allhpx5) - echo "HPX-5" - sh ./travis/install-autotools.sh $TRAVIS_ROOT - sh ./travis/install-hpx5.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems diff --git a/travis/install-hpx.sh b/travis/install-hpx.sh new file mode 100755 index 000000000..5fa0cda89 --- /dev/null +++ b/travis/install-hpx.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +case "$TRAVIS_OS_NAME" in + linux) + ;; + osx) + set +e + brew update + for p in boost jemalloc gperftools ; do + brew install $p || brew upgrade $p + done + set -e + ;; +esac + +if [ ! -d "$TRAVIS_ROOT/hpx" ]; then + cd $TRAVIS_ROOT + git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx-source + cd hpx-source + mkdir build + cd build + cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF + make -j2 + # make check # target does not exist + make install +else + echo "HPX installed..." + find $TRAVIS_ROOT/hpx +fi diff --git a/travis/install-hpx3.sh b/travis/install-hpx3.sh deleted file mode 100755 index 50bf6878d..000000000 --- a/travis/install-hpx3.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -case "$TRAVIS_OS_NAME" in - linux) - ;; - osx) - set +e - brew update - if [ "$USE_HPX_TARBALL" ] ; then - export HPX_BOOST="homebrew/versions/boost155" - else - export HPX_BOOST="boost" - fi - for p in $HPX_BOOST jemalloc gperftools ; do - brew install $p || brew upgrade $p - done - set -e - ;; -esac - -if [ ! 
-d "$TRAVIS_ROOT/hpx3" ]; then - cd $TRAVIS_ROOT - #if [ "$USE_HPX_TARBALL" ] ; then - # wget -q --no-check-certificate http://stellar.cct.lsu.edu/files/hpx_0.9.11.tar.bz2 - # if [ `which md5` ] ; then - # echo "MD5 signature is:" - # md5 hpx_0.9.11.tar.bz2 - # echo "MD5 signature should be:" - # echo "86a71189fb6344d27bf53d6aa2b33122" - # fi - # tar -xjf hpx_0.9.11.tar.bz2 - # cd hpx_0.9.11 - #else - git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx3-source - cd hpx3-source - #fi - mkdir build - cd build - cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx3 -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF - make -j2 - # make check # target does not exist - make install -else - echo "HPX-3 installed..." - find $TRAVIS_ROOT/hpx3 -fi diff --git a/travis/install-hpx5.sh b/travis/install-hpx5.sh deleted file mode 100755 index 6fa6f29d9..000000000 --- a/travis/install-hpx5.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -if [ ! -d "$TRAVIS_ROOT/hpx5" ] ; then - cd $TRAVIS_ROOT - if [ "0" = "1" ] ; then - wget -q --no-check-certificate http://hpx.crest.iu.edu/release/HPX_Release_v2.0.0.tar.gz - if [ `which shasum` ] ; then - echo "SHA-256 signature is:" - shasum -a 256 HPX_Release_v2.0.0.tar.gz - echo "SHA-256 signature should be:" - echo "647c5f0ef3618f734066c91d741021d7bd38cf21" - fi - tar -xzf HPX_Release_v2.0.0.tar.gz - cd HPX_Release_v2.0.0/hpx - else - export GIT_SSL_NO_VERIFY=1 - git clone --depth 1 http://gitlab.crest.iu.edu/extreme/hpx.git hpx5-source - cd hpx5-source - fi - ./bootstrap - ./configure --prefix=$TRAVIS_ROOT/hpx5 - make -j2 - make check - make install -else - echo "HPX-5 installed..." - find $TRAVIS_ROOT/hpx5 -name hpx-config -fi From 042b1855aa7618669e242682c36fb7574134d57f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 22:46:03 -0800 Subject: [PATCH 02/80] HPX is WIP --- Cxx11/Makefile | 7 ++ Cxx11/nstream-hpx.cc | 176 +++++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_hpx.h | 41 ++++++++++ doc/HPX.md | 13 ++++ 4 files changed, 237 insertions(+) create mode 100644 Cxx11/nstream-hpx.cc create mode 100644 Cxx11/prk_hpx.h create mode 100644 doc/HPX.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e3b9e76fe..2adb8c486 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,6 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA +HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -122,6 +123,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range kokkos: stencil-kokkos transpose-kokkos nstream-kokkos +hpx: nstream-hpx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -193,6 +196,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-hpx: %-hpx.cc prk_util.h prk_hpx.h + $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h ${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) 
-DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ @@ -262,6 +268,7 @@ clean: -rm -f *-rangefor -rm -f *-raja -rm -f *-kokkos + -rm -f *-hpx -rm -f *-thrust -rm -f *-cuda -rm -f *-cublas diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc new file mode 100644 index 000000000..d41f5fe6d --- /dev/null +++ b/Cxx11/nstream-hpx.cc @@ -0,0 +1,176 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_hpx.h"
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (offset < 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+
+  {
+    std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
+        A[i] = 0;
+        B[i] = 2;
+        C[i] = 2;
+    });
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
+          A[i] += B[i] + scalar * C[i];
+      });
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += prk::abs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (prk::abs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_hpx.h b/Cxx11/prk_hpx.h
new file mode 100644
index 000000000..36c523eb1
--- /dev/null
+++ b/Cxx11/prk_hpx.h
@@ -0,0 +1,41 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_HPX_H +#define PRK_HPX_H + +#include +#include + +#include +#include + +#endif /* PRK_HPX_H */ diff --git a/doc/HPX.md b/doc/HPX.md new file mode 100644 index 000000000..1fce480f7 --- /dev/null +++ b/doc/HPX.md @@ -0,0 +1,13 @@ +# + +```sh +cmake .. -DCMAKE_INSTALL_PREFIX=$PRK_DIR/Cxx11/hpx \ + -DCMAKE_CXX_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang \ + -DHPX_WITH_TESTS:BOOL=Off \ + -DHPX_WITH_TESTS_BENCHMARKS:BOOL=Off \ + -DHPX_WITH_TESTS_EXAMPLES:BOOL=Off \ + -DHPX_WITH_TESTS_REGRESSIONS:BOOL=Off \ + -DHPX_WITH_TESTS_UNIT:BOOL=Off +make install +``` From 8c63805f1d6267beaa91398618f4c494224977c7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:46:26 -0800 Subject: [PATCH 03/80] add HPX flags to make.defs. Signed-off-by: Jeff Hammond --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 44 ++++++++++++++++++++++++++++++++++++++++-- common/make.defs.intel | 36 +++++++++++++++++++++++++--------- common/make.defs.llvm | 13 ++++++++++--- 4 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 2adb8c486..478b29cb4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -197,7 +197,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ %-hpx: %-hpx.cc prk_util.h prk_hpx.h - $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8df7db087..4fd4a74ff 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -44,6 +44,9 @@ ORNLACCFLAG=-fopenacc # # MacOS OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL @@ -52,13 +55,33 @@ METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... #SYCLDIR=./triSYCL #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -METALFLAG=-framework MetalPerformanceShaders # # OCCA # @@ -71,7 +94,7 @@ METALFLAG=-framework MetalPerformanceShaders # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -79,6 +102,9 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} @@ -89,6 +115,10 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... 
+HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # @@ -105,6 +135,16 @@ CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# # ISPC # ISPC=ispc diff --git a/common/make.defs.intel b/common/make.defs.intel index cab461c08..145b1e750 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -42,19 +42,37 @@ OFFLOADFLAG=-qopenmp-offload=host # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include -# ProGTX -# https://github.com/ProGTX/sycl-gtx -#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA # @@ -62,7 +80,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -CILKFLAG=-intel-extensions # default +#CILKFLAG=-intel-extensions # default # # TBB # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 6a668bf14..b65febe80 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -85,7 +85,7 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA @@ -95,7 +95,7 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -103,15 +103,22 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. 
# #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # From 0556b39e400134cd586a5b79dd3fc43730876ca2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:02 -0800 Subject: [PATCH 04/80] update HPX flags --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 3 ++- common/make.defs.intel | 5 +++++ common/make.defs.llvm | 3 ++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 478b29cb4..72526a38a 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,7 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA -HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) +HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 4fd4a74ff..7e0736211 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... +HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.intel b/common/make.defs.intel index 145b1e750..92a0d4e64 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -99,6 +99,11 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HWLOCFLAG=-I/usr/local/include +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index b65febe80..4021f6c6d 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... 
+HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # From 5c799cc1faf5261a6a8a05399246eff5fb2cc0b6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:32 -0800 Subject: [PATCH 05/80] this template is really dated at this point --- common/{make.defs.in => make.defs.old} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename common/{make.defs.in => make.defs.old} (100%) diff --git a/common/make.defs.in b/common/make.defs.old similarity index 100% rename from common/make.defs.in rename to common/make.defs.old From d8ef1bdec5f9b27d08655f9ae47ff2b75573d743 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:02:07 -0800 Subject: [PATCH 06/80] use HPX for_each --- Cxx11/nstream-hpx.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc index d41f5fe6d..47d1b775c 100644 --- a/Cxx11/nstream-hpx.cc +++ b/Cxx11/nstream-hpx.cc @@ -120,7 +120,7 @@ int main(int argc, char * argv[]) double scalar(3); { - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] = 0; B[i] = 2; C[i] = 2; @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] += B[i] + scalar * C[i]; }); } From 21621722e7e93426ec6e9b35ae211ab1e2ce6fbe Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:20:10 -0800 Subject: [PATCH 07/80] UPC++ support --- travis/install-deps.sh | 1 + travis/install-upcxx.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100755 travis/install-upcxx.sh diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..72effa7b6 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -76,6 +76,7 @@ case "$PRK_TARGET" in sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT + sh ./travis/install-upcxx.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" diff --git a/travis/install-upcxx.sh b/travis/install-upcxx.sh new file mode 100755 index 000000000..3725d361f --- /dev/null +++ b/travis/install-upcxx.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +UPCXX_RELEASE=upcxx-2019.9.0 +UPCXX_PREFIX=$TRAVIS_ROOT/$UPCXX_RELEASE + +if [ ! -d "$UPCXX_PREFIX" ]; then + cd $TRAVIS_ROOT + wget --no-check-certificate -q https://bitbucket.org/berkeleylab/upcxx/downloads/${UPCXX_RELEASE}.tar.gz + tar -xzf $UPCXX_RELEASE.tar.gz + cd $UPCXX_RELEASE + ./install $TRAVIS_ROOT/upcxx +else + echo "UPC++ installed..." 
+ find $TRAVIS_ROOT/upcxx -name upcxx -type f +fi + From 33cc8d5928b58a5f3656c0fb384ba7f7bbfb4448 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:21:34 -0800 Subject: [PATCH 08/80] install-hpx in deps --- travis/install-deps.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 3c4fc29a1..19aa5bdcc 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -75,6 +75,7 @@ case "$PRK_TARGET" in #sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT + sh ./travis/install-hpx.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT ;; allfortran) @@ -169,11 +170,6 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx) - echo "HPX" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-hpx3.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems From 69efdf6efc464fc3d1cee82877b1eb9ab275618c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:45:51 -0800 Subject: [PATCH 09/80] add UPC++ skeleton --- Cxx11/Makefile | 6 ++ Cxx11/nstream-upcxx.cc | 184 +++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_ranges.h | 18 ++-- Cxx11/prk_upcxx.h | 37 +++++++++ common/make.defs.gcc | 6 ++ common/make.defs.llvm | 6 ++ 6 files changed, 247 insertions(+), 10 deletions(-) create mode 100644 Cxx11/nstream-upcxx.cc create mode 100644 Cxx11/prk_upcxx.h diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 72526a38a..2bb1bc4ba 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -45,6 +45,7 @@ STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) +UPCXXFLAGS = $(CPPFLAGS) -DUSE_UPCXX $(UPCXXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -125,6 +126,8 @@ kokkos: stencil-kokkos transpose-kokkos nstream-kokkos hpx: nstream-hpx +upcxx: nstream-upcxx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -196,6 +199,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-upcxx: %-upcxx.cc prk_util.h prk_upcxx.h + $(UPCXX) $(UPCXXFLAGS) $< -o $@ + %-hpx: %-hpx.cc prk_util.h prk_hpx.h $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc new file mode 100644 index 000000000..083ab96b4 --- /dev/null +++ b/Cxx11/nstream-upcxx.cc @@ -0,0 +1,184 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_upcxx.h" +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + upcxx::init(); + + const int me = upcxx::rank_me(); + const int np = upcxx::rank_n(); + + if (me==0) { + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + } + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0;
+      if (offset < 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+
+  {
+    std::for_each(std::begin(range), std::end(range), [&] (size_t i) {
+        A[i] = 0;
+        B[i] = 2;
+        C[i] = 2;
+    });
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      std::for_each(std::begin(range), std::end(range), [&] (size_t i) {
+          A[i] += B[i] + scalar * C[i];
+      });
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += prk::abs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (prk::abs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  upcxx::finalize();
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h
index 9eb081844..62281e043 100644
--- a/Cxx11/prk_ranges.h
+++ b/Cxx11/prk_ranges.h
@@ -32,16 +32,14 @@
 #ifndef PRK_RANGES_H
 #define PRK_RANGES_H
 
-#if defined(USE_RANGES)
-# if defined(USE_BOOST_IRANGE)
-#  include "boost/range/irange.hpp"
-# elif defined(USE_RANGES_TS)
-#  include "range/v3/view/iota.hpp"
-#  include "range/v3/view/slice.hpp"
-#  include "range/v3/view/stride.hpp"
-# else
-#  error You have not provided a version of ranges to use.
-# endif
+#if defined(USE_BOOST_IRANGE)
+# include "boost/range/irange.hpp"
+#elif defined(USE_RANGES_TS)
+# include "range/v3/view/iota.hpp"
+# include "range/v3/view/slice.hpp"
+# include "range/v3/view/stride.hpp"
+#else
+# error You have not provided a version of ranges to use.
+#endif
 #endif
 
 namespace prk {
diff --git a/Cxx11/prk_upcxx.h b/Cxx11/prk_upcxx.h
new file mode 100644
index 000000000..27db8592e
--- /dev/null
+++ b/Cxx11/prk_upcxx.h
@@ -0,0 +1,37 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_UPCXX_H +#define PRK_UPCXX_H + +#include + +#endif /* PRK_UPCXX_H */ diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7e0736211..ec0535f57 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 4021f6c6d..c150d9ed2 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # From 2699970ba76e369e157fd72c0a466628d9bc98a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 16:15:30 -0800 Subject: [PATCH 10/80] fix banner --- Cxx11/nstream-upcxx.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc index 083ab96b4..7925aef08 100644 --- a/Cxx11/nstream-upcxx.cc +++ b/Cxx11/nstream-upcxx.cc @@ -74,7 +74,7 @@ int main(int argc, char * argv[]) if (me==0) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "UPC++ STREAM triad: A = B + scalar * C" << std::endl; } ////////////////////////////////////////////////////////////////////// From a834102a6f69a38aa697a2c1653127a77a67d2db Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 06:48:34 -0700 Subject: [PATCH 11/80] add README to capture what I am learning here --- HALIDE/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 HALIDE/README.md diff --git a/HALIDE/README.md b/HALIDE/README.md new file mode 100644 index 000000000..af170de52 --- /dev/null +++ b/HALIDE/README.md @@ -0,0 +1,11 @@ +# Halide + +# Notes + +``` +$ git clone https://github.com/halide/Halide.git +``` + +``` +$ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` From f37a0ac91fe5d1a2fe16bc53a872ce99dfcb32dc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 
10:21:40 -0700 Subject: [PATCH 12/80] add notes since Halide has pre-modern build system --- HALIDE/README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/HALIDE/README.md b/HALIDE/README.md index af170de52..efa2e2d96 100644 --- a/HALIDE/README.md +++ b/HALIDE/README.md @@ -6,6 +6,51 @@ $ git clone https://github.com/halide/Halide.git ``` +# MacOS + +This works: +``` +make CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Ubuntu 18.10 + +This works: +``` +make PREFIX=/opt/halide +``` + +This does not work: + ``` $ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config ``` + +This does not work: + +``` +$ make CC=/usr/local/Cellar/llvm/8.0.0/bin/clang CXX=/usr/local/Cellar/llvm/8.0.0/bin/clang++ CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Issues + +*TL;DR* Do not try to use non-default compilers. + +https://github.com/halide/Halide/issues/3884 + +Mac: +``` +$ make CC=gcc-9 CXX=g++-9 CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +g++-9 -Wall -Werror -Wno-unused-function -Wcast-qual -Wignored-qualifiers -Wno-comment -Wsign-compare -Wno-unknown-warning-option -Wno-psabi -Wsuggest-override -Woverloaded-virtual -fPIC -O3 -fno-omit-frame-pointer -DCOMPILING_HALIDE -std=c++11 -I/usr/local/Cellar/llvm/8.0.0/include -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/tmp/llvm-20190320-85215-19esl1h/llvm-8.0.0.src/tools/lld/include -DLLVM_VERSION=80 -DWITH_PTX=1 -DWITH_ARM=1 -DWITH_HEXAGON=1 -DWITH_AARCH64=1 -DWITH_X86=1 -DWITH_OPENCL=1 -DWITH_METAL=1 -DWITH_OPENGL=1 -DWITH_D3D12=1 -DWITH_MIPS=1 -DWITH_POWERPC=1 -DWITH_WEBASSEMBLY=1 -DWITH_INTROSPECTION -DWITH_AMDGPU=1 -funwind-tables -c ~/Work/Languages/Halide/src/Util.cpp -o bin/build/Util.o -MMD -MP -MF bin/build/Util.d -MT bin/build/Util.o +~/Work/Languages/Halide/src/Util.cpp: In function 'std::string Halide::Internal::running_program_name()': +~/Work/Languages/Halide/src/Util.cpp:80:19: error: 'PATH_MAX' was not declared in this scope + 80 | char path[PATH_MAX] = { 0 }; + | ^~~~~~~~ +~/Work/Languages/Halide/src/Util.cpp:81:32: error: 'path' was not declared in this scope + 81 | uint32_t size = sizeof(path); + | ^~~~ +At global scope: +cc1plus: error: unrecognized command line option '-Wno-unknown-warning-option' [-Werror] +cc1plus: all warnings being treated as errors +make: *** [bin/build/Util.o] Error 1 +``` From e69f46c08111e7b28ec078f81f8e7f0b965face2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:16 -0700 Subject: [PATCH 13/80] add Halide to examples --- common/make.defs.gcc | 14 ++++++++++++-- common/make.defs.llvm | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8ad79efb2..bf4b46ecf 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -151,13 +151,23 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -#NVCC=/opt/llvm/cocl/bin/cocl +NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 -CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED 
+#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 2aecf26d8..db54b5cc0 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -187,9 +187,9 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -#NVCC=nvcc -arch=sm_50 +NVCC=nvcc -arch=sm_50 CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 From 2bd32e246c5fdb61c5cb1b9158796981cf5718d6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:27 -0700 Subject: [PATCH 14/80] add Halide nstream I have no idea what I am doing and this code is wrong and/or bad. --- HALIDE/README.md => Cxx11/HALIDE.md | 0 Cxx11/Makefile | 6 + Cxx11/nstream-halide.cc | 190 ++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) rename HALIDE/README.md => Cxx11/HALIDE.md (100%) create mode 100644 Cxx11/nstream-halide.cc diff --git a/HALIDE/README.md b/Cxx11/HALIDE.md similarity index 100% rename from HALIDE/README.md rename to Cxx11/HALIDE.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index b435091ed..fe6e8e891 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,6 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) +HALIDEFLAGS = $(HALIDEFLAG) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -133,6 +134,7 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc +halide: nstream-halide boost-compute: nstream-boost-compute # busted @@ -253,6 +255,9 @@ endif $(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.) $(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ +%-halide: %-halide.cc prk_util.h + $(HALIDECXX) $(CXXFLAGS) $< $(HALIDEFLAGS) -o $@ + %-ornlacc: %-ornlacc.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(ORNLACCFLAGS) -o $@ @@ -300,6 +305,7 @@ clean: -rm -f *-cblas -rm -f *-onemkl -rm -f *-occa + -rm -f *-halide -rm -f *-boost-compute -rm -f *-ornlacc -rm -f transpose-async transpose-thread diff --git a/Cxx11/nstream-halide.cc b/Cxx11/nstream-halide.cc new file mode 100644 index 000000000..51f0eee16 --- /dev/null +++ b/Cxx11/nstream-halide.cc @@ -0,0 +1,190 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "Halide.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Halide STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + const Halide::Target target = Halide::get_jit_target_from_environment(); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double scalar = 3.0; + + Halide::Buffer A(length); + Halide::Buffer B(length); + Halide::Buffer C(length); + + for (size_t i=0; i out = nstream.realize(length); +#endif + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + From a54488c28d8e8b2d2314d03340d20da81255bae1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 20:59:45 -0700 Subject: [PATCH 15/80] add Stencil for Halide --- Cxx11/Makefile | 3 +- Cxx11/stencil-halide.cc | 231 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 Cxx11/stencil-halide.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index fe6e8e891..f3e6e5d3b 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -134,7 +134,8 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc -halide: nstream-halide + +halide: nstream-halide stencil-halide boost-compute: nstream-boost-compute # busted diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc new file mode 100644 index 000000000..f0aab6461 --- /dev/null +++ b/Cxx11/stencil-halide.cc @@ -0,0 +1,231 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "stencil_seq.hpp" + +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + // n will never be zero - this is to silence compiler warnings. 
+ if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + std::abort(); +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + prk::vector in(n*n); + prk::vector out(n*n); + + { + for (auto it=0; it(i+j); + out[i*n+j] = 0.0; + } + } + } + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator + stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any + std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); + } + stencil_time = prk::wtime() - stencil_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} From b36ad8d0246a32c40dd1b8e410fdf4d550e8d7a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:07 -0700 Subject: [PATCH 16/80] ignore halide and occa binaries --- .gitignore | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 55140184d..8329936cb 100644 --- a/.gitignore +++ b/.gitignore @@ -184,13 +184,11 @@ Cxx11/p2p-vector-raja Cxx11/p2p-tbb Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-openmp -Cxx11/p2p-doacross-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector +Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-vector Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-tbb @@ -224,6 +222,8 @@ Cxx11/nstream-celerity Cxx11/nstream-hpx Cxx11/nstream-upcxx Cxx11/nstream-executors +Cxx11/nstream-occa +Cxx11/nstream-halide Cxx11/pic Cxx11/pic-dpcpp Cxx11/pic-sycl @@ -258,6 +258,8 @@ Cxx11/stencil-sycl-usm Cxx11/stencil-sycl-explicit Cxx11/stencil-sycl-explicit-usm Cxx11/stencil-dpcpp +Cxx11/stencil-occa +Cxx11/stencil-halide Cxx11/transpose Cxx11/transpose-openmp Cxx11/transpose-mpi @@ -288,6 +290,8 @@ Cxx11/transpose-device-thrust Cxx11/transpose-host-thrust Cxx11/transpose-cublas Cxx11/transpose-cuda +Cxx11/transpose-occa +Cxx11/transpose-halide Cxx11/grid1.cl Cxx11/grid2.cl Cxx11/grid3.cl From 6cb5371e7bc828f109fcdff2f48993b001c1c826 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:22 -0700 Subject: [PATCH 17/80] less wrong --- Cxx11/stencil-halide.cc | 76 ++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc index f0aab6461..bdd1f1487 100644 --- a/Cxx11/stencil-halide.cc +++ b/Cxx11/stencil-halide.cc @@ -61,22 +61,12 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" -#include "stencil_seq.hpp" - -void nothing(const int n, const int t, prk::vector & in, prk::vector & out) -{ - std::cout << "You are trying to use a stencil that does not exist.\n"; - std::cout << "Please generate the new stencil using the code generator\n"; - std::cout << "and add it to the case-switch in the driver." << std::endl; - // n will never be zero - this is to silence compiler warnings. 
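Context for the Halide rewrite that follows: the generated star kernels are replaced by a single Halide Func evaluated over the grid interior. A sketch of the radius-2 PRK star expressed that way, using the 1/(2*i*radius) weights of the generated CPU/SYCL kernels (a sketch under those assumptions, not the exact Func this patch defines):

#include "Halide.h"

// Sketch only: radius-2 PRK star as a Halide pipeline.
void star2_halide(int n, Halide::Buffer<double> &in, Halide::Buffer<double> &out)
{
  Halide::Var x("x"), y("y");
  Halide::Func star2("star2");
  star2(x, y) = out(x, y)
              + 0.25  * (in(x+1, y) - in(x-1, y) + in(x, y+1) - in(x, y-1))
              + 0.125 * (in(x+2, y) - in(x-2, y) + in(x, y+2) - in(x, y-2));

  Halide::Buffer<double> tmp(n - 4, n - 4);
  tmp.set_min(2, 2);      // evaluate only the interior so the in() accesses stay in bounds
  star2.realize(tmp);     // JIT-compiles on first call
  out.copy_from(tmp);     // copy the updated interior back into out
}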
- if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; - std::abort(); -} +#include "Halide.h" int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + std::cout << "C++11/Halide Stencil execution on 2D grid" << std::endl; ////////////////////////////////////////////////////////////////////// // Process and test input parameters @@ -139,54 +129,48 @@ int main(int argc, char* argv[]) std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; std::cout << "Radius of stencil = " << radius << std::endl; - auto stencil = nothing; - if (star) { - switch (radius) { - case 1: stencil = star1; break; - case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; - } - } else { - switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; - } - } + const Halide::Target target = Halide::get_jit_target_from_environment(); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto stencil_time = 0.0; + double stencil_time(0); - prk::vector in(n*n); - prk::vector out(n*n); + Halide::Buffer in(n,n); + Halide::Buffer out(n,n); + + Halide::Var x("x"); + Halide::Var y("y"); + + Halide::Expr c1(0.25); + Halide::Expr c2(0.125); + Halide::Func stencil; + stencil(x,y) = c1 * ( in(x+1,y) + in(x-1,y) + in(x,y+1) + in(x,y+1) ) + + c2 * ( in(x+2,y) + in(x-2,y) + in(x,y+2) + in(x,y+2) ); { - for (auto it=0; it(i+j); - out[i*n+j] = 0.0; - } - } + for (auto i=0; i Date: Tue, 3 Mar 2020 09:44:00 -0800 Subject: [PATCH 18/80] move documentation to the right place Signed-off-by: Jeff Hammond --- {Cxx11 => doc}/HALIDE.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {Cxx11 => doc}/HALIDE.md (100%) diff --git a/Cxx11/HALIDE.md b/doc/HALIDE.md similarity index 100% rename from Cxx11/HALIDE.md rename to doc/HALIDE.md From 52bd76531144854fb3f9e537b1c1e46457853796 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 2 Nov 2020 08:48:46 -0800 Subject: [PATCH 19/80] update Halide stuff for 10.0 release --- common/make.defs.gcc | 6 +++--- common/make.defs.llvm | 6 +++--- common/make.defs.oneapi | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index bf4b46ecf..f0fccc68c 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -162,12 +162,12 @@ CUDAFLAGS+=-arch=sm_50 # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 1764d24f0..08fb7b1a1 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -202,12 +202,12 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib 
-L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index be6b2dc4b..38e163047 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -106,6 +106,16 @@ CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=icpx +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 +# # ISPC # ISPC=ispc From b5b422f1f8f1987d8f8720f412fee512b2ac0721 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Dec 2021 17:53:42 +0200 Subject: [PATCH 20/80] fix make.defs* --- common/make.defs.gcc | 39 -------------------------------------- common/make.defs.intel | 29 +--------------------------- common/make.defs.llvm | 23 ---------------------- common/make.defs.upcxx-hpx | 9 +++++++++ 4 files changed, 10 insertions(+), 90 deletions(-) create mode 100644 common/make.defs.upcxx-hpx diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7a179c356..7c6d9b188 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -56,16 +56,6 @@ METALFLAG=-framework MetalPerformanceShaders # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -87,7 +77,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/sycl/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -107,8 +96,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -<<<<<<< HEAD -======= # hipSYCL # SYCLDIR=/opt/hipSYCL @@ -124,7 +111,6 @@ CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime MPIINC=-I/usr/include/mpich-3.2-x86_64 MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi ->>>>>>> default # # OCCA # @@ -137,11 +123,7 @@ MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -149,19 +131,11 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. 
# #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -<<<<<<< HEAD -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -======= #BOOSTFLAG=-I/usr/include/boost169 BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include #RANGEFLAG=-DUSE_GCC_RANGES ->>>>>>> default PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} #PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc @@ -170,11 +144,8 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx @@ -209,15 +180,6 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # # Halide # -<<<<<<< HEAD -HALIDECXX=c++ -HALIDEDIR=/opt/halide -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 -======= HALIDECXX=${CXX} HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include @@ -225,7 +187,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 ->>>>>>> default # # ISPC # diff --git a/common/make.defs.intel b/common/make.defs.intel index 047c363ad..1abbb0c75 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -43,22 +43,10 @@ OFFLOADFLAG+=-DGPU_SCHEDULE="" # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -81,7 +69,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/codeplay/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -93,19 +80,11 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLFLAG+=${OPENCLFLAG} # NVIDIA target #SYCLFLAG+=-sycl-target ptx64 -<<<<<<< HEAD -======= #SYCLFLAG+=-DPRK_NO_OPENCL_GPU ->>>>>>> default # # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-<<<<<<< HEAD -#SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= SYCLDIR=./triSYCL SYCLCXX=${CXX} ${OPENMPFLAG} SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include @@ -123,7 +102,6 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include #CELERITYDIR=${SYCLDIR} #CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor #CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -131,7 +109,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -#CILKFLAG=-intel-extensions # default +CILKFLAG=-intel-extensions # default # # TBB # @@ -150,11 +128,6 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -# HPX is more complicated... -HWLOCFLAG=-I/usr/local/include -HPXDIR=./hpx -HPXCXX=${HPXDIR}/bin/hpxcxx -HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 502331404..730e1fa08 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -128,11 +128,6 @@ SYCLFLAG+=${OPENCLFLAG} # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -<<<<<<< HEAD -SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= #SYCLDIR=./triSYCL #SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL @@ -155,7 +150,6 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL CELERITYDIR=${SYCLDIR} CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -164,26 +158,13 @@ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. 
# -<<<<<<< HEAD -#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include -======= #BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include # old Homebrew #BOOSTFLAG=-I/usr/include/boost169 # Linux BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_2/include # new Homebrew @@ -196,7 +177,6 @@ SYCLFLAG+=${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} PSTLFLAG+=-I./llvm-pstl/include -DLLVM_PSTL ->>>>>>> default KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl KOKKOSFLAG+=${OPENMPFLAG} @@ -205,11 +185,8 @@ RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx diff --git a/common/make.defs.upcxx-hpx b/common/make.defs.upcxx-hpx new file mode 100644 index 000000000..a30623ad5 --- /dev/null +++ b/common/make.defs.upcxx-hpx @@ -0,0 +1,9 @@ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math + +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} From 285677d201e1d04ccb68eda8df07eb86b52d549c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Jan 2022 12:15:37 +0200 Subject: [PATCH 21/80] factor out PRK MPI module utility --- FORTRAN/Makefile | 11 +++++-- FORTRAN/prk_mpi.F90 | 60 +++++++++++++++++++++++++++++++++++ FORTRAN/transpose-a2a-mpi.F90 | 29 ----------------- FORTRAN/transpose-acc-mpi.F90 | 29 ----------------- FORTRAN/transpose-get-mpi.F90 | 29 ----------------- FORTRAN/transpose-p2p-mpi.F90 | 29 ----------------- 6 files changed, 68 insertions(+), 119 deletions(-) create mode 100644 FORTRAN/prk_mpi.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4a1315edb..54e0b11ae 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -96,6 +96,9 @@ blas: dgemm-blas prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o +prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 + $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 $(FC) $(FCFLAGS) stencil.F90 stencil_serial.o prk_mod.o -o $@ @@ -119,10 +122,10 @@ dgemm-blas: dgemm-blas.F90 prk.mod $(MPIFORT) $(FCFLAGS) $< prk_mod.o $(GAFLAG) -o $@ %-mpi-openmp: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ + $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o prk_mpi_mod.o -o $@ -%-mpi: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $< prk_mod.o -o $@ +%-mpi: %-mpi.F90 prk.mod prk_mpi.mod + $(MPIFORT) $(FCFLAGS) $< prk_mod.o prk_mpi_mod.o -o $@ %-coarray: %-coarray.F90 prk.mod $(CAFC) $(FCFLAGS) $< prk_mod.o $(COARRAYFLAG) -o $@ @@ -142,6 +145,8 @@ dgemm-blas: dgemm-blas.F90 prk.mod clean: -rm -f prk.mod -rm -f prk.f18.mod + -rm -f prk_mpi.mod + -rm -f prk_mpi.f18.mod -rm -f *.o -rm -f *.i90 -rm -f *.dbg diff --git a/FORTRAN/prk_mpi.F90 b/FORTRAN/prk_mpi.F90 new file mode 100644 index 000000000..f1508f450 --- /dev/null 
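The module added below replaces identical private copies in the MPI transpose programs (removed further down). A minimal sketch of a caller, linked against the prk_mod.o and prk_mpi_mod.o objects from the Makefile rules above (illustrative only, not one of the actual programs):

! Illustrative caller only.
program prk_mpi_demo
  use, intrinsic :: iso_fortran_env
  use mpi_f08
  use prk_mpi
  implicit none
  real(kind=REAL64), allocatable :: A(:,:)
  call MPI_Init()
  allocate( A(4,4) )
  A = 1.0d0
  call mpi_print_matrix(A, 'A =')   ! prints rank by rank, separated by barriers
  deallocate( A )
  call MPI_Finalize()
end program prk_mpi_demo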
+++ b/FORTRAN/prk_mpi.F90 @@ -0,0 +1,60 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use, intrinsic :: iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..a57615201 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..6e2904a47 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! 
******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ecd6ed18d 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..1ae8dbc9a 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! 
******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 From a640d7add111bbcc05341cd4e162dd7080e4244d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 5 Jan 2022 07:31:12 -0800 Subject: [PATCH 22/80] extra targets --- FORTRAN/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 3cd82fe25..bcc68182a 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -38,7 +38,7 @@ ifeq ($(findstring ifx,$(FC)),ifx) endif # GCC (also matches pgfortran so PGI must come after) ifeq ($(findstring gfortran,$(FC)),gfortran) - EXTRA = target coarray taskloop openacc + EXTRA = target coarray taskloop openacc blas endif # PGI and LLVM Flang ifeq ($(findstring flang,$(FC)),flang) @@ -50,7 +50,7 @@ ifeq ($(findstring pgf,$(FC)),pgf) FCFLAGS += -DPGI endif ifeq ($(findstring nvf,$(FC)),nvf) - EXTRA = target openacc cufortran + EXTRA = target openacc cufortran stdpar blas FCFLAGS += -DNVHPC endif ifeq ($(findstring xlf,$(FC)),xlf) From 719977ce107152e4355629ca749d2914957ca570 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 11:56:26 +0300 Subject: [PATCH 23/80] implement Pablo's changes in code generator Signed-off-by: Jeff Hammond --- Cxx11/generate-sycl-stencil.py | 26 ++----- Cxx11/stencil_sycl.hpp | 121 ++++++++++++++++----------------- 2 files changed, 64 insertions(+), 83 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 22756399e..c67f2d124 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -33,20 +33,14 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') src.write(' sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n') src.write(' h.parallel_for>(') - src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('sycl::range<2> {n-'+str(radius)+',n-'+str(radius)+'}, ') src.write('[=] (sycl::item<2> it) {\n') if (dim==2): - src.write(' sycl::id<2> xy = it.get_id();\n') + src.write(' sycl::id<2> xy = it.get_id() + sycl::id<2> {'+str(radius)+','+str(radius)+'};\n') src.write(' out[xy] += ') else: - # 1D indexing the slow way - #src.write(' auto i = it[0];\n') - #src.write(' auto j = it[1];\n') - #src.write(' out[i*n+j] += ') - # 1D indexing the fast way - src.write(' const auto i = it[0];\n') - src.write(' const auto j = it[1];\n') + src.write(' const auto i = it[0] + '+str(radius)+';\n') + src.write(' const auto j = it[1] + '+str(radius)+';\n') src.write(' out[i*n+j] += ') if pattern == 'star': for i in range(1,radius+1): @@ -62,18 +56,6 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('\n'+19*' ') src.write('+in[xy-dy'+str(i)+'] 
* static_cast('+str(-1./(2.*i*radius))+')') else: - # 1D indexing the slow way - #if i > 1: - # src.write('\n') - # src.write(22*' ') - #src.write('+in[i*n+(j+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[i*n+(j-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i+'+str(i)+')*n+j] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i-'+str(i)+')*n+j] * static_cast('+str(-1./(2.*i*radius))+')') - # 1D indexing the fast way if i > 1: src.write('\n') src.write(30*' ') diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 024e796c4..64af40b79 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,4 +1,3 @@ - // declare the kernel name used in SYCL parallel_for template class star1_1d; @@ -143,18 +142,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -176,18 +175,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy3(sycl::range<2> {0,3}); h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id() + sycl::id<2> {3,3}; - out[xy] += +in[xy+dx1] * static_cast(0.166666666667) - +in[xy-dx1] * static_cast(-0.166666666667) - +in[xy+dy1] * static_cast(0.166666666667) - +in[xy-dy1] * static_cast(-0.166666666667) - +in[xy+dx2] * static_cast(0.0833333333333) - +in[xy-dx2] * static_cast(-0.0833333333333) - +in[xy+dy2] * static_cast(0.0833333333333) - +in[xy-dy2] * static_cast(-0.0833333333333) - +in[xy+dx3] * static_cast(0.0555555555556) - +in[xy-dx3] * static_cast(-0.0555555555556) - +in[xy+dy3] * static_cast(0.0555555555556) - +in[xy-dy3] * static_cast(-0.0555555555556); + out[xy] += +in[xy+dx1] * static_cast(0.16666666666666666) + +in[xy-dx1] * static_cast(-0.16666666666666666) + +in[xy+dy1] * static_cast(0.16666666666666666) + +in[xy-dy1] * static_cast(-0.16666666666666666) + +in[xy+dx2] * static_cast(0.08333333333333333) + +in[xy-dx2] * static_cast(-0.08333333333333333) + +in[xy+dy2] * static_cast(0.08333333333333333) + +in[xy-dy2] * static_cast(-0.08333333333333333) + 
+in[xy+dx3] * static_cast(0.05555555555555555) + +in[xy-dx3] * static_cast(-0.05555555555555555) + +in[xy+dy3] * static_cast(0.05555555555555555) + +in[xy-dy3] * static_cast(-0.05555555555555555); }); }); } @@ -202,18 +201,18 @@ void star3(sycl::queue & q, const size_t n, const T * in, T * out) h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -238,10 +237,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + +in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * static_cast(0.03125) @@ -277,10 +276,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.0625) +in[xy+dy2] * static_cast(0.0625) +in[xy-dy2] * static_cast(-0.0625) - +in[xy+dx3] * static_cast(0.0416666666667) - +in[xy-dx3] * static_cast(-0.0416666666667) - +in[xy+dy3] * static_cast(0.0416666666667) - +in[xy-dy3] * static_cast(-0.0416666666667) + +in[xy+dx3] * static_cast(0.041666666666666664) + +in[xy-dx3] * static_cast(-0.041666666666666664) + +in[xy+dy3] * static_cast(0.041666666666666664) + +in[xy-dy3] * static_cast(-0.041666666666666664) +in[xy+dx4] * static_cast(0.03125) +in[xy-dx4] * static_cast(-0.03125) +in[xy+dy4] * static_cast(0.03125) @@ -307,10 +306,10 @@ void star4(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + 
+in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * static_cast(0.03125) @@ -339,10 +338,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) @@ -384,10 +383,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.05) +in[xy+dy2] * static_cast(0.05) +in[xy-dy2] * static_cast(-0.05) - +in[xy+dx3] * static_cast(0.0333333333333) - +in[xy-dx3] * static_cast(-0.0333333333333) - +in[xy+dy3] * static_cast(0.0333333333333) - +in[xy-dy3] * static_cast(-0.0333333333333) + +in[xy+dx3] * static_cast(0.03333333333333333) + +in[xy-dx3] * static_cast(-0.03333333333333333) + +in[xy+dy3] * static_cast(0.03333333333333333) + +in[xy-dy3] * static_cast(-0.03333333333333333) +in[xy+dx4] * static_cast(0.025) +in[xy-dx4] * static_cast(-0.025) +in[xy+dy4] * static_cast(0.025) @@ -418,10 +417,10 @@ void star5(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) From 527e58c517e52c16f8d8b9e41016878fa5c6b9bf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:02:04 +0300 Subject: [PATCH 24/80] remove unnecessary deprecated offset Signed-off-by: Jeff Hammond --- Cxx11/stencil-2d-sycl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index ee42e2da0..541273634 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -144,7 +144,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, [=] (sycl::item<2> it) { + h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); in[xy] += static_cast(1); }); From 643b7965d2f4e47453cf5eaf4ff19db33a50313e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:12:31 +0300 Subject: [PATCH 25/80] add a workaround for FP64 problems with DPC++ on TGL --- common/make.defs.oneapi | 9 +++++++-- 1 file changed, 7 
insertions(+), 2 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index ec6421d24..246af6200 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -19,7 +19,7 @@ CXX=icpx -std=c++20 -pthread #--gcc-toolchain=/opt/gcc/11.2.0 # Compiler flags # # -xHOST is appropriate for most cases. -DEFAULT_OPT_FLAGS=-g -O3 -xHOST +DEFAULT_OPT_FLAGS=-g3 -O3 -xHOST # # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 @@ -32,6 +32,7 @@ OPENMPFLAG=-qopenmp OPENMPSIMDFLAG=-qopenmp-simd OFFLOADFLAG=-fopenmp-targets=spir64 OFFLOADFLAG+=-DGPU_SCHEDULE="" +STDPARFLAG=-parallel -qmkl # # OpenCL flags # @@ -59,8 +60,12 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL # SYCLCXX=dpcpp SYCLFLAG=-fsycl -SYCLFLAG+=-std=c++17 -O3 +SYCLFLAG+=-std=c++17 -O3 -g3 SYCLFLAG+=-DDPCPP +# this is because the DPC++ compiler will fail to compile run on Tiger Lake +# even though the code explicitly checks for FP64 support and only instantiates the +# template when the device query says FP64 is supported. +SYCLFLAG+=-DDPCPP_NO_DOUBLE # # # OCCA From 2d2898f1a5dd1b9c711fc8b6b65324ae3fae57c6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:15:37 +0300 Subject: [PATCH 26/80] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/transpose-2d-sycl.cc | 11 ++++++++++- Cxx11/transpose-sycl-usm.cc | 8 +++++++- Cxx11/transpose-sycl.cc | 8 +++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc index 83092891e..2fbe8938b 100644 --- a/Cxx11/transpose-2d-sycl.cc +++ b/Cxx11/transpose-2d-sycl.cc @@ -217,7 +217,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -234,7 +236,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -250,13 +254,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc index c1d9a4fec..1ec5c1470 100644 --- a/Cxx11/transpose-sycl-usm.cc +++ b/Cxx11/transpose-sycl-usm.cc @@ -200,7 +200,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -217,7 +219,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,16 +237,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index d3bcc0215..da0d596c0 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -216,7 +216,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,7 +235,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -249,16 +253,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From 7f5b6fa0d40665a297b0e66c8dc94b3a9948a321 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:18:16 +0300 Subject: [PATCH 27/80] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/nstream-sycl-explicit-usm.cc | 8 +++++++- Cxx11/nstream-sycl-explicit.cc | 8 +++++++- Cxx11/nstream-sycl-usm.cc | 8 +++++++- Cxx11/nstream-sycl.cc | 8 +++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index 22325b565..aa5c5c690 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -278,7 +278,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -295,7 +297,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -311,16 +315,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index a3083a244..adf045d32 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -271,7 +271,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -288,7 +290,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -304,16 +308,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index f119746ff..e872a5130 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -256,7 +256,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -273,7 +275,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -289,16 +293,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index a95a163aa..140125f9d 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -253,7 +253,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -270,7 +272,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -286,16 +290,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From 86b6acd2a6c9760d9b6d4bb39e41578561d5f0b3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:09:29 +0300 Subject: [PATCH 28/80] no double stuff --- Cxx11/stencil-2d-sycl.cc | 11 ++++++++++- Cxx11/stencil-sycl-usm.cc | 8 +++++++- Cxx11/stencil-sycl.cc | 8 +++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index 541273634..b6eeb09bc 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -281,7 +281,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -298,7 +300,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -314,13 +318,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index 8b7adfac5..b219b24f1 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -270,7 +270,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -287,7 +289,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -303,16 +307,18 @@ int main(int argc, char * argv[]) try { sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index b78706df2..8947c8dee 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -279,7 +279,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -296,7 +298,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -312,16 +316,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From bfc6bb94d033852e6b303a253e334fa733a6aee4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:50:08 +0300 Subject: [PATCH 29/80] nstream C OpenACC --- C1z/nstream-openacc.c | 173 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 C1z/nstream-openacc.c diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c new file mode 100644 index 000000000..051342f45 --- /dev/null +++ b/C1z/nstream-openacc.c @@ -0,0 +1,173 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. 
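
A minimal sketch of the accumulate-triad pattern this kernel implements, assuming an OpenACC toolchain (e.g. nvc++ -acc), the usual PRK starting values A=0, B=2, C=2, and hard-coded sizes; the real benchmark reads the iteration count and vector length from the command line and reports the MB/s figure described in the NOTES above:

    // Illustrative OpenACC triad: A += B + scalar*C on device-resident vectors.
    // Sizes, names and initial values are assumptions of this sketch, not the
    // benchmark's command-line interface.
    #include <cstdio>
    #include <cstdlib>
    #include <cmath>
    #include <openacc.h>

    int main(void)
    {
      const int    iterations = 10;
      const size_t length     = 1000000;
      const double scalar     = 3.0;
      const size_t bytes      = length * sizeof(double);

      // acc_malloc returns device memory, so every loop marks the pointers deviceptr.
      double * A = static_cast<double*>(acc_malloc(bytes));
      double * B = static_cast<double*>(acc_malloc(bytes));
      double * C = static_cast<double*>(acc_malloc(bytes));

      #pragma acc parallel loop deviceptr(A,B,C)
      for (size_t i = 0; i < length; ++i) {
        A[i] = 0.0;
        B[i] = 2.0;
        C[i] = 2.0;
      }

      for (int k = 0; k <= iterations; ++k) {
        #pragma acc parallel loop deviceptr(A,B,C)
        for (size_t i = 0; i < length; ++i) {
          A[i] += B[i] + scalar * C[i];
        }
      }

      // Each pass adds B[i] + scalar*C[i] = 2 + 3*2 = 8, so after iterations+1 passes
      // every element of A is 8*(iterations+1); the checksum scales that by length.
      double asum = 0.0;
      #pragma acc parallel loop reduction(+:asum) deviceptr(A)
      for (size_t i = 0; i < length; ++i) {
        asum += std::fabs(A[i]);
      }
      const double expected = 8.0 * (iterations + 1) * (double)length;

      acc_free(A); acc_free(B); acc_free(C);

      printf("checksum %.1f expected %.1f\n", asum, expected);
      return (std::fabs(asum - expected) / expected < 1.0e-8) ? 0 : 1;
    }

Because acc_malloc hands back device memory, every loop that touches A, B or C has to name them in a deviceptr clause; that is the convention the OpenACC ports in this series follow.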
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC STREAM triad: A = B + scalar * C\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the vector + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Vector length must be greater than 0\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + double * restrict C = acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + From e15f271a9153a24ae04d6911bb11dc9148d51929 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:08 +0300 Subject: [PATCH 30/80] transpose OpenACC --- C1z/transpose-openacc.c | 167 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 C1z/transpose-openacc.c diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c new file mode 100644 index 000000000..679afb1d8 --- /dev/null +++ b/C1z/transpose-openacc.c @@ -0,0 +1,167 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// C11-ification by Jeff Hammond, June 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC Matrix transpose: B = A^T\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> [tile size]\n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // order of a the matrix + int order = atoi(argv[2]); + if (order <= 0) { + printf("ERROR: Matrix Order must be greater than 0\n"); + return 1; + } + + // default tile size for tiling of local transpose + int tile_size = (argc>3) ? 
atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time = 0.0; + + size_t bytes = order*order*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i Date: Wed, 18 May 2022 11:07:13 +0300 Subject: [PATCH 31/80] transpose OpenACC --- C1z/Makefile | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index f8927c191..5e01f0894 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -22,34 +22,22 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) +ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) -.PHONY: all clean serial thread openmp target taskloop ispc # cilk +.PHONY: all clean serial thread openmp tasks target taskloop ispc EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icc,$(CC)),icc) - EXTRA += target - endif -else - ifneq ($(findstring icx,$(CC)),icx) - EXTRA += target - endif -endif ifdef ($(ISPC)) EXTRA += ispc endif ifneq ($(CILKFLAG),) EXTRA += cilk endif -ifeq ($(findstring xlc,$(CC)),xlc) - EXTRA = target - CFLAGS += -DXLC -endif -ifneq ($(findstring icx,$(CC)),icx) - EXTRA += tasks +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif all: serial thread openmp $(EXTRA) @@ -83,6 +71,8 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop +openacc: nstream-openacc transpose-openacc + cilk: stencil-cilk transpose-cilk ispc: transpose-ispc @@ -132,6 +122,9 @@ p2p-2d: p2p-2d.c prk_util.h %-openmp: %-openmp.c prk_util.h prk_openmp.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ +%-openacc: %-openacc.c prk_util.h + $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ @@ -161,6 +154,7 @@ clean: -rm -f p2p-sse p2p-avx p2p-avx3 p2p-avx-tasks-openmp -rm -f *-2d -rm -f *-openmp + -rm -f *-openacc -rm -f *-mpi -rm -f *-petsc -rm -f *-target From dc4f0554be4ae24cb3f2d64977529fa8eac797f6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:44 +0300 Subject: [PATCH 32/80] ignore and build --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 7843446ba..cb7046a77 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ C1z/nstream-mmap C1z/nstream-mmap-openmp C1z/nstream-mpi C1z/nstream-openmp +C1z/nstream-openacc C1z/nstream-petsc C1z/nstream-target C1z/nstream-taskloop @@ -96,6 +97,7 @@ C1z/transpose-2d-openmp C1z/transpose-cilk C1z/transpose-ispc C1z/transpose-openmp +C1z/transpose-openacc C1z/transpose-petsc C1z/transpose-target C1z/transpose-taskloop From a7bb31096312af10dbbb09c9cb3e7b894afcfcc3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:16:35 +0300 Subject: [PATCH 33/80] OpenACC C stencil --- .gitignore | 1 + C1z/Makefile | 2 +- C1z/stencil-openacc.c | 230 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 232 insertions(+), 1 deletion(-) 
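
The transpose kernel added above shares the same device-memory conventions; its core is a loop nest that accumulates B += A^T and then bumps every element of A by one so each sweep transposes fresh values. A self-contained sketch with illustrative sizes (the real code also accepts an optional tile size and times the sweeps):

    // Illustrative OpenACC transpose: B += A^T followed by A += 1 each sweep.
    // Order and iteration count are assumptions of this sketch.
    #include <cstdio>
    #include <cmath>
    #include <openacc.h>

    int main(void)
    {
      const int order      = 1024;
      const int iterations = 10;
      const size_t bytes   = (size_t)order * order * sizeof(double);

      double * A = static_cast<double*>(acc_malloc(bytes));
      double * B = static_cast<double*>(acc_malloc(bytes));

      #pragma acc parallel loop collapse(2) deviceptr(A,B)
      for (int i = 0; i < order; ++i) {
        for (int j = 0; j < order; ++j) {
          A[i*order+j] = (double)(i*order+j);   // A(i,j) = i*order + j
          B[i*order+j] = 0.0;
        }
      }

      for (int k = 0; k <= iterations; ++k) {
        #pragma acc parallel loop collapse(2) deviceptr(A,B)
        for (int i = 0; i < order; ++i) {
          for (int j = 0; j < order; ++j) {
            B[i*order+j] += A[j*order+i];   // accumulate the transpose
            A[j*order+i] += 1.0;            // each element incremented exactly once per sweep
          }
        }
      }

      // After iterations+1 sweeps, B(i,j) = A0(j,i)*(iterations+1) + (0+1+...+iterations).
      const double addit = 0.5 * iterations * (iterations + 1.0);
      double abserr = 0.0;
      #pragma acc parallel loop collapse(2) reduction(+:abserr) deviceptr(B)
      for (int i = 0; i < order; ++i) {
        for (int j = 0; j < order; ++j) {
          const double reference = (double)(j*order+i) * (iterations + 1.0) + addit;
          abserr += std::fabs(B[i*order+j] - reference);
        }
      }

      acc_free(A); acc_free(B);

      printf("sum of absolute differences = %e\n", abserr);
      return (abserr < 1.0e-8) ? 0 : 1;
    }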
create mode 100644 C1z/stencil-openacc.c diff --git a/.gitignore b/.gitignore index cb7046a77..df8aeaa8f 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ C1z/stencil-2d C1z/stencil-2d-openmp C1z/stencil-cilk C1z/stencil-openmp +C1z/stencil-openacc C1z/stencil-target C1z/stencil-taskloop C1z/transpose diff --git a/C1z/Makefile b/C1z/Makefile index 5e01f0894..c8c61ed10 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -71,7 +71,7 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop -openacc: nstream-openacc transpose-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc cilk: stencil-cilk transpose-cilk diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c new file mode 100644 index 000000000..edc7e994b --- /dev/null +++ b/C1z/stencil-openacc.c @@ -0,0 +1,230 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - C99-ification by Jeff Hammond, February 2016. +/// - C11-ification by Jeff Hammond, June 2017. 
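
The stencil operators themselves live in a generated header (stencil_openacc.h, produced by generate-c-stencil.py in a later patch of this series). As a rough picture of what one generated instance looks like, here is a radius-1 star kernel in that style; the 32x32 tile mirrors the pragma emitted by the generator's OpenACC path, and the +/-0.5 weights are the assumed PRK values for a radius-1 star:

    // Illustrative radius-1 star kernel in the style of the generated stencil_openacc.h.
    // Weights (+/-0.5) and the tile size are assumptions of this sketch.
    void star1(const int n, const double * __restrict__ in, double * __restrict__ out)
    {
      #pragma acc parallel loop tile(32,32) deviceptr(in,out)
      for (int i = 1; i < n-1; ++i) {
        for (int j = 1; j < n-1; ++j) {
          out[i*n+j] += in[(i-1)*n + j] * -0.5
                      + in[i*n + (j-1)] * -0.5
                      + in[i*n + (j+1)] *  0.5
                      + in[(i+1)*n + j] *  0.5;
        }
      }
    }

The driver below then only has to pick the starN/gridN function matching the requested radius and hand it device pointers.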
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +typedef void (*stencil_t)(const int, const double * restrict, double * restrict); + +void nothing(const int n, const double * restrict in, double * restrict out) +{ + printf("You are trying to use a stencil that does not exist.\n"); + printf("Please generate the new stencil using the code generator.\n"); + // n will never be zero - this is to silence compiler warnings. + if (n==0) printf("%p %p\n", in, out); + abort(); +} + +#include "stencil_openacc.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION); + printf("C11/OpenACC Stencil execution on 2D grid\n"); + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3){ + printf("Usage: <# iterations> [ ]\n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + int n = atoi(argv[2]); + if (n < 1) { + printf("ERROR: grid dimension must be positive\n"); + return 1; + } else if (n > floor(sqrt(INT_MAX))) { + printf("ERROR: grid dimension too large - overflow risk\n"); + return 1; + } + + // stencil pattern + bool star = true; + if (argc > 3) { + char* pattern = argv[3]; + star = (0==strncmp(pattern,"star",4)) ? true : false; + } + + // stencil radius + int radius = 2; + if (argc > 4) { + radius = atoi(argv[4]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + printf("ERROR: Stencil radius negative or too large\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Grid sizes = %d\n", n); + printf("Type of stencil = %s\n", (star ? "star" : "grid") ); + printf("Radius of stencil = %d\n", radius ); + + stencil_t stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + case 6: stencil = star6; break; + case 7: stencil = star7; break; + case 8: stencil = star8; break; + case 9: stencil = star9; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + case 6: stencil = grid6; break; + case 7: stencil = grid7; break; + case 8: stencil = grid8; break; + case 9: stencil = grid9; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time = 0.0; + + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); + size_t bytes = n*n*sizeof(double); + + double * restrict in = acc_malloc(bytes); + double * restrict out = acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i epsilon) { + printf("ERROR: L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); + return 1; + } else { + printf("Solution validates\n"); +#ifdef VERBOSE + printf("L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); +#endif + const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2*stencil_size+1) * active_points; + double avgtime = stencil_time/iterations; + printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0e-6 * (double)flops/avgtime, avgtime ); + } + + return 0; +} From c8fe3a913230cfd8a55739d0fef0cf08b39ffc7c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:00 +0300 Subject: [PATCH 34/80] stencil for OpenACC --- C1z/generate-c-stencil.py | 6 +- C1z/stencil_openacc.h | 3126 +++++++++++++++++++++++++++++++++++++ 2 files changed, 3130 insertions(+), 2 deletions(-) create mode 100644 C1z/stencil_openacc.h diff --git a/C1z/generate-c-stencil.py b/C1z/generate-c-stencil.py index f6dc86032..20a2c9455 100755 --- a/C1z/generate-c-stencil.py +++ b/C1z/generate-c-stencil.py @@ -21,9 +21,11 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim): if (model=='openmp'): outer += 'OMP_FOR()\n ' elif (model=='target'): - outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n ' + outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) )\n ' elif (model=='taskloop'): outer += 'OMP_TASKLOOP( firstprivate(n) shared(in,out) grainsize(gs) )\n ' + elif (model=='openacc'): + outer += 'PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) )\n ' elif (model=='cilk'): outer += '_Cilk_' @@ -82,7 +84,7 @@ def instance(src,model,pattern,r,dim): codegen(src,pattern,stencil_size,r,W,model,dim) def main(): - for model in ['seq','openmp','target','cilk','taskloop']: + for model in ['seq','openmp','target','cilk','taskloop','openacc']: src = open('stencil_'+model+'.h','w') for pattern in ['star','grid']: for r in range(1,10): diff --git a/C1z/stencil_openacc.h b/C1z/stencil_openacc.h new file mode 100644 index 000000000..09652a00b --- /dev/null +++ b/C1z/stencil_openacc.h @@ -0,0 +1,3126 @@ +void star1(const int n, const double * restrict in, double * restrict out) { + PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) ) + for (int i=1; i Date: Wed, 18 May 2022 12:03:20 +0300 Subject: [PATCH 35/80] remove schedule --- C1z/stencil_target.h | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/C1z/stencil_target.h b/C1z/stencil_target.h index b50d70636..28d1a5fcf 100644 --- a/C1z/stencil_target.h +++ b/C1z/stencil_target.h @@ -1,5 +1,5 @@ void star1(const int n, const double * restrict in, double * restrict out) { - OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) ) + OMP_TARGET( teams distribute parallel for simd collapse(2) ) for (int i=1; i Date: Wed, 18 May 2022 12:03:28 +0300 Subject: [PATCH 36/80] OpenACC --- C1z/Makefile | 11 ++++++----- C1z/transpose-openacc.c | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index c8c61ed10..f719a9096 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -3,13 +3,14 @@ include ../common/PRKVERSION CPPFLAGS = -DPRKVERSION=$(PRKVERSION) -CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) - # debugging ifdef VERBOSE - CFLAGS += -DVERBOSE + CPPFLAGS += -DVERBOSE endif +CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) + + ifdef PRK_USE_MMAP CFLAGS += -DPRK_USE_MMAP endif @@ -22,10 +23,10 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) -ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) +OPENACCFLAGS = $(OPENACCFLAG) .PHONY: all clean serial thread openmp tasks target taskloop ispc @@ -123,7 +124,7 @@ p2p-2d: p2p-2d.c prk_util.h $(CC) 
$(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ %-openacc: %-openacc.c prk_util.h - $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + $(CC) $(CFLAGS) $< $(OPENACCFLAGS) $(EXTRA_CLIBS) -o $@ %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 679afb1d8..0ffc76c8e 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -71,14 +71,12 @@ int main(int argc, char * argv[]) return 1; } - // number of times to do the transpose int iterations = atoi(argv[1]); if (iterations < 1) { printf("ERROR: iterations must be >= 1\n"); return 1; } - // order of a the matrix int order = atoi(argv[2]); if (order <= 0) { printf("ERROR: Matrix Order must be greater than 0\n"); From 6f8e9d1c8564abbf206d95a6d3da7be601ce8993 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:45 +0300 Subject: [PATCH 37/80] cleanup --- Cxx11/Makefile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a96805be9..365e92363 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -1,7 +1,7 @@ -include ../common/Cxx11.defs +include ../common/make.defs include ../common/PRKVERSION -CPPFLAGS = -DPRKVERSION=$(PRKVERSION) +CPPFLAGS = -DPRKVERSION=$(PRKVERSION) # debugging ifdef VERBOSE @@ -31,7 +31,7 @@ endif #ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm ASMFLAGS = -fverbose-asm -OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP +OMPFLAGS = $(OPENMPFLAG) TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) -DCL_HPP_MINIMUM_OPENCL_VERSION=120 -DCL_HPP_TARGET_OPENCL_VERSION=120 -DCL_HPP_ENABLE_EXCEPTIONS # We do not yet handle all possible exceptions... @@ -62,18 +62,17 @@ OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib boost-compute thrust executor oneapi onemkl EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icpc,$(CXX)),icpc) - EXTRA += target - endif -else - EXTRA += target +ifneq ($(findstring nvc++,$(CXX)),nvc++) + EXTRA += ranges stl pstl +endif +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif -ifneq ($(findstring pgc++,$(CXX)),pgc++) - EXTRA += pstl +ifneq ($(SYCLCC),) + EXTRA += sycl endif -all: sequential vector valarray openmp taskloop stl ranges opencl sycl $(EXTRA) +all: sequential vector valarray openmp taskloop opencl $(EXTRA) sequential: p2p stencil transpose nstream dgemm sparse @@ -137,7 +136,7 @@ oneapi: onemkl dpcpp sycl onedpl occa: transpose-occa nstream-occa -openacc: p2p-hyperplane-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc p2p-hyperplane-openacc stdpar: nstream-stdpar transpose-stdpar #stencil-stdpar p2p-stdpar From 706bdaeb0e0c7deab21b56949a0078636eeb8026 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:55 +0300 Subject: [PATCH 38/80] remove USE_OPENMP --- Cxx11/prk_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index e5314fd81..93a037f78 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -81,7 +81,7 @@ #endif // omp_get_wtime() -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) #include #endif @@ -301,7 +301,7 @@ namespace prk { static inline double wtime(void) { -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) return omp_get_wtime(); #else using t = std::chrono::high_resolution_clock; From 59b06de28bd0fcf5bf699dbb058ef4f149d2ee38 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 
12:04:10 +0300 Subject: [PATCH 39/80] remove unnecessary indirection --- common/Cxx11.defs | 1 - 1 file changed, 1 deletion(-) delete mode 100644 common/Cxx11.defs diff --git a/common/Cxx11.defs b/common/Cxx11.defs deleted file mode 100644 index d146ce6f7..000000000 --- a/common/Cxx11.defs +++ /dev/null @@ -1 +0,0 @@ -include ../common/make.defs From 9a20e2ec7e90fc82bdce894a261de41105eb935f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:27:53 +0300 Subject: [PATCH 40/80] OpenACC --- .gitignore | 5 + C1z/nstream-openacc.c | 1 - C1z/stencil-openacc.c | 5 +- Cxx11/generate-cxx-stencil.py | 13 +- Cxx11/nstream-openacc.cc | 177 +++++++++++++++ Cxx11/stencil-openacc.cc | 233 ++++++++++++++++++++ Cxx11/stencil_openacc.hpp | 397 ++++++++++++++++++++++++++++++++++ Cxx11/transpose-openacc.cc | 173 +++++++++++++++ 8 files changed, 998 insertions(+), 6 deletions(-) create mode 100644 Cxx11/nstream-openacc.cc create mode 100644 Cxx11/stencil-openacc.cc create mode 100644 Cxx11/stencil_openacc.hpp create mode 100644 Cxx11/transpose-openacc.cc diff --git a/.gitignore b/.gitignore index df8aeaa8f..2dab1847b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ Cxx11/nstream-cublas Cxx11/nstream-cuda Cxx11/nstream-cuda-managed Cxx11/nstream-dpcpp +Cxx11/nstream-onedpl Cxx11/nstream-executors Cxx11/nstream-hip Cxx11/nstream-hipblas @@ -154,6 +155,7 @@ Cxx11/nstream-multigpu-dpcpp Cxx11/nstream-onemkl Cxx11/nstream-opencl Cxx11/nstream-openmp +Cxx11/nstream-openacc Cxx11/nstream-openmp-target Cxx11/nstream-pstl Cxx11/nstream-raja @@ -174,6 +176,7 @@ Cxx11/nstream-vector-raja Cxx11/p2p Cxx11/p2p-doacross-openmp Cxx11/p2p-hyperplane-openmp +Cxx11/p2p-hyperplane-openacc Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-sycl @@ -212,6 +215,7 @@ Cxx11/stencil-kokkos Cxx11/stencil-mpi Cxx11/stencil-opencl Cxx11/stencil-openmp +Cxx11/stencil-openacc Cxx11/stencil-openmp-target Cxx11/stencil-pstl Cxx11/stencil-raja @@ -243,6 +247,7 @@ Cxx11/transpose-kokkos Cxx11/transpose-mpi Cxx11/transpose-opencl Cxx11/transpose-openmp +Cxx11/transpose-openacc Cxx11/transpose-openmp-target Cxx11/transpose-pstl Cxx11/transpose-raja diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 051342f45..94985da56 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -53,7 +53,6 @@ /// by the execution time. For a vector length of N, the total /// number of words read and written is 4*N*sizeof(double). /// -/// /// HISTORY: This code is loosely based on the Stream benchmark by John /// McCalpin, but does not follow all the Stream rules. Hence, /// reported results should not be associated with Stream in diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c index edc7e994b..6f79c40f3 100644 --- a/C1z/stencil-openacc.c +++ b/C1z/stencil-openacc.c @@ -160,10 +160,7 @@ int main(int argc, char * argv[]) double stencil_time = 0.0; - // interior of grid with respect to stencil - size_t active_points = (n-2*radius)*(n-2*radius); size_t bytes = n*n*sizeof(double); - double * restrict in = acc_malloc(bytes); double * restrict out = acc_malloc(bytes); @@ -196,6 +193,8 @@ int main(int argc, char * argv[]) // Analyze and output results. 
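
One step in the analysis that follows deserves a sentence of justification: with the PRK star weights (+/-1/(2*d*r) at distance d along each axis, an assumption of this note) applied to the ramp in(i,j) = i+j, every interior point gains exactly 2 per application, and the +1 added to `in` after each sweep contributes nothing because the weights sum to zero, so the reference value the validation compares against works out to 2*(iterations+1) per active point. A tiny check of that arithmetic:

    // Confirms numerically that a radius-r star with weights +/- 1/(2*d*r) applied to
    // the ramp in(i,j) = i + j produces exactly 2 at every interior point (assumed
    // PRK weight convention; d is the offset along an axis).
    #include <cstdio>

    int main(void)
    {
      for (int r = 1; r <= 9; ++r) {
        double response = 0.0;
        for (int d = 1; d <= r; ++d) {
          const double w = 1.0 / (2.0 * d * r);
          response += w * (2.0 * d);   // x-axis pair: in(i+d,j) - in(i-d,j) = 2*d
          response += w * (2.0 * d);   // y-axis pair contributes the same
        }
        printf("radius %d: response = %.6f\n", r, response);   // 2.000000 for every radius
      }
      return 0;
    }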
////////////////////////////////////////////////////////////////////// + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); // compute L1 norm in parallel double norm = 0.0; #pragma acc parallel loop reduction( +:norm ) deviceptr(out) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 00095484e..67cf61894 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -67,6 +67,15 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write('}\n\n') + elif (model=='openacc'): + src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n') + src.write(' PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) )\n') + src.write(' for (int i='+str(radius)+'; i <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenACC STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time{0}; + + size_t bytes = length*sizeof(double); + double * RESTRICT A = (double *) acc_malloc(bytes); + double * RESTRICT B = (double *) acc_malloc(bytes); + double * RESTRICT C = (double *) acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << std::setprecision(16) + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 
1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/stencil-openacc.cc b/Cxx11/stencil-openacc.cc new file mode 100644 index 000000000..18a1e212f --- /dev/null +++ b/Cxx11/stencil-openacc.cc @@ -0,0 +1,233 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" +#include "stencil_openacc.hpp" + +void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) +{ + // use arguments to silence compiler warnings + out[0] = in[0] + n + t; +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > prk::get_max_matrix_size()) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time{0}; + + size_t bytes = n*n*sizeof(double); + double * RESTRICT in = (double *)acc_malloc(bytes); + double * RESTRICT out = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i(i+j); + out[i*n+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + stencil(n, tile_size, in, out); + + #pragma acc parallel loop collapse(2) deviceptr(in) + for (int i=0; i(n-2*radius)*static_cast(n-2*radius); + // compute L1 norm in parallel + double norm = 0.0; + #pragma acc parallel loop reduction( +:norm ) deviceptr(out) + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil_openacc.hpp b/Cxx11/stencil_openacc.hpp new file mode 100644 index 000000000..523cda771 --- /dev/null +++ b/Cxx11/stencil_openacc.hpp @@ -0,0 +1,397 @@ +#define RESTRICT __restrict__ + +void star1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { + PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) ) + for (int i=1; i <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + try { + if (argc < 3) { + throw "Usage: <# iterations> [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : order; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double trans_time{0}; + + size_t bytes = order*order*sizeof(double); + double * restrict A = (double *)acc_malloc(bytes); + double * restrict B = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i(i*order+j); + B[i*order+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) + for (int i=0;i(ij)*(1.+iterations)+addit; + abserr += prk::abs(B[ji] - reference); + } + } + + acc_free(A); + acc_free(B); + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + From 5af2232d7a831564832c81951d96904d3a93c159 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:36:49 +0300 Subject: [PATCH 41/80] cleanup --- C1z/nstream-openacc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 94985da56..ba4e587af 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) ar *= length; double asum = 0.0; - #pragma acc parallel loop reduction( +:asum ) deviceptr(A,B,C) + #pragma acc parallel loop reduction( +:asum ) deviceptr(A) for (size_t i=0; i Date: Wed, 18 May 2022 03:45:59 -0700 Subject: [PATCH 42/80] fix validation --- Cxx11/dgemm-multigpu-cublas.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cxx11/dgemm-multigpu-cublas.cu 
b/Cxx11/dgemm-multigpu-cublas.cu index 160a9d12c..439f33a9b 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - info.print(); + //info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -306,18 +306,18 @@ int main(int argc, char * argv[]) double residuum(0); for (int i=0; i Date: Wed, 18 May 2022 03:47:03 -0700 Subject: [PATCH 43/80] print --- Cxx11/dgemm-multigpu-cublas.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu index 439f33a9b..18a039425 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - //info.print(); + info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters From b002f2829a2149b7989011e04339954b510485f3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 03:47:32 -0700 Subject: [PATCH 44/80] update --- common/make.defs.nvhpc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 38438888b..fa4b59e8b 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,6 +1,6 @@ # # This file shows the NVHPC toolchain options. -NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/22.2 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ @@ -74,7 +74,7 @@ CBLASFLAG=${BLASFLAG} NVCC=${NVHPC_CBIN}nvcc CUDAFLAGS=-g -O3 -std=c++17 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_75 +CUDAFLAGS+=--gpu-architecture=sm_80 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' @@ -106,8 +106,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED # MPI-3 # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
-MPIDIR=${NVHPC_PATH}/comm_libs/openmpi/openmpi-3.1.5 -#MPIDIR=${NVHPC_PATH}/comm_libs/openmpi4/openmpi-4.0.5 +MPIDIR=${NVHPC_PATH}/comm_libs/hpcx/latest/ompi MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From 06f16806267f74daa87d74b2d7e07b4372218e43 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 17:50:36 +0300 Subject: [PATCH 45/80] transpose cleanup --- FORTRAN/nstream-mpi.F90 | 3 +- FORTRAN/transpose-openacc.F90 | 119 ++++++-------------------- FORTRAN/transpose-openmp-target.F90 | 69 +++------------ FORTRAN/transpose-openmp.F90 | 63 +++----------- FORTRAN/transpose-pointer.F90 | 52 ++--------- FORTRAN/transpose-pretty.F90 | 51 +++-------- FORTRAN/transpose-stdpar.F90 | 71 +++------------ FORTRAN/transpose-taskloop-openmp.F90 | 65 +++----------- FORTRAN/transpose-tasks-openmp.F90 | 66 +++----------- FORTRAN/transpose.F90 | 10 +-- 10 files changed, 118 insertions(+), 451 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 2f4e58937..66ba8d30c 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -139,7 +139,8 @@ program main !$omp parallel default(none) & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) + !$omp& private(i,k,t0,t1) & + !$omp& shared(MPI_COMM_WORLD) #endif #if defined(_OPENMP) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 02ab0ab9d..1a0a69fe9 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -80,120 +78,50 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + write(*,'(a22,i8)') 'Tile size = ', tile_size ! ******************************************************************** ! 
** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo + !$acc data create(A,B) + + !$acc parallel loop collapse(2) + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0 enddo - endif + enddo - !$acc data pcopyin(A) pcopy(B) do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() - ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo + !$acc parallel loop tile(tile_size,tile_size) + do j=1,order + do i=1,order + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo - endif + enddo enddo ! iterations t1 = prk_get_wtime() - !$acc end data - trans_time = t1 - t0 ! ******************************************************************** @@ -212,8 +140,9 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + !$acc end data + + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 4aa431b18..a8b75a245 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -81,66 +80,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran OpenMP TARGET Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.gt.order).or.(tile_size.lt.1)) then - tile_size = order - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if (tile_size.lt.order) then - if (mod(order,tile_size).ne.0) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if (tile_size.gt.32) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -223,8 +179,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp.F90 b/FORTRAN/transpose-openmp.F90 index 93dab50a8..d88d470ff 100644 --- a/FORTRAN/transpose-openmp.F90 +++ b/FORTRAN/transpose-openmp.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -80,63 +79,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran OpenMP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - t0 = 0 - !$omp parallel default(none) & !$omp& shared(A,B,t0,t1) & !$omp& firstprivate(order,iterations,tile_size) & @@ -172,6 +135,8 @@ program main !$omp end do endif + t0 = 0 + ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. !$omp barrier diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 index 87c3eaac1..b576d5e36 100644 --- a/FORTRAN/transpose-pointer.F90 +++ b/FORTRAN/transpose-pointer.F90 @@ -57,10 +57,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -83,38 +80,14 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Serial Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** @@ -130,10 +103,6 @@ program main A(1:order,1:order) => TA B(1:order,1:order) => TB - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 if (tile_size.lt.order) then @@ -158,9 +127,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -204,8 +171,7 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index 885c4ac3d..6eff0820d 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -53,13 +53,11 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix + integer(kind=INT32) :: tile_size real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices @@ -77,57 +75,37 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Pretty Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 
1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 ! Fill the original matrix o2 = int(order,INT64)**2 A = reshape((/ (j2, j2 = 0,o2) /),(/order, order/)) B = 0 - t0 = 0 - do k=0,iterations - ! start timer after a warmup iteration + if (k.eq.1) t0 = prk_get_wtime() + B = B + transpose(A) A = A + 1 enddo ! iterations @@ -155,8 +133,7 @@ program main abserr = norm2(A-B) #endif - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 26c0e87f5..7faf89646 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -81,76 +79,33 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran stdpar Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 16 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 
1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if ((tile_size.gt.0).and.(mod(order,tile_size).ne.0)) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if ((tile_size.ne.order) .and. (tile_size.gt.32)) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 do concurrent (j=1:order, i=1:order) A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo - t0 = 0 - do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() @@ -180,7 +135,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! ******************************************************************** @@ -196,8 +150,7 @@ program main abserr = abserr + abs(B(i,j) - (temp+addit)) enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-taskloop-openmp.F90 b/FORTRAN/transpose-taskloop-openmp.F90 index 3cc0fbc78..fccef232f 100644 --- a/FORTRAN/transpose-taskloop-openmp.F90 +++ b/FORTRAN/transpose-taskloop-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,26 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a50)') 'Fortran OpenMP TASKLOOP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a,i1)') 'argument count = ', command_argument_count() - write(*,'(a)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +124,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() !$omp taskloop firstprivate(order,tile_size) shared(A,B) private(i,j,it,jt) do jt=1,order,tile_size @@ -211,8 +173,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-tasks-openmp.F90 b/FORTRAN/transpose-tasks-openmp.F90 index a0ac9afb9..7cce694ba 100644 --- a/FORTRAN/transpose-tasks-openmp.F90 +++ b/FORTRAN/transpose-tasks-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a46)') 'Fortran OpenMP TASKS Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +125,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() do jt=1,order,tile_size !$omp task firstprivate(order,tile_size,jt) shared(A,B) private(i,j,it) @@ -211,8 +174,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 4e398a1bf..56fb6ab26 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -98,6 +98,8 @@ program main stop 1 endif + t0 = 0 + if (tile_size.lt.order) then do jt=1,order,tile_size do it=1,order,tile_size @@ -118,12 +120,9 @@ program main enddo endif - t0 = 0 - do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -149,7 +148,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! 
******************************************************************** From 359da8eabe2fd842ef9d1611d63a73bfe085c902 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 18:05:13 +0300 Subject: [PATCH 46/80] fix arg parse in MPI --- FORTRAN/transpose-a2a-mpi.F90 | 35 ++++------------------ FORTRAN/transpose-acc-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-ga.F90 | 55 +++++++++++------------------------ FORTRAN/transpose-get-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-p2p-mpi.F90 | 34 ++++------------------ 5 files changed, 37 insertions(+), 155 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..c38158397 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,8 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi - !integer(kind=INT32) :: it, jt, tile_size + integer(kind=INT32) :: i, j, k, r, lo, hi, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +115,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..2b49bb0bd 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-ga.F90 b/FORTRAN/transpose-ga.F90 index 8d81c038d..5e2fde45f 100644 --- a/FORTRAN/transpose-ga.F90 +++ b/FORTRAN/transpose-ga.F90 @@ -55,14 +55,12 @@ program main use, intrinsic :: iso_fortran_env use mpi_f08 + use prk implicit none #include "global.fh" #include "mafdecls.fh" !#include 'ga-mpi.fh' ! unused - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! MPI - should always use 32-bit INTEGER integer(kind=INT32), parameter :: requested = MPI_THREAD_SERIALIZED integer(kind=INT32) :: provided @@ -86,33 +84,7 @@ program main real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 - ! ******************************************************************** - ! read and test input parameters - ! ******************************************************************** - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - call mpi_init_thread(requested,provided) + call MPI_Init_thread(requested,provided) !call ga_initialize() ! 
ask GA to allocate enough memory for 4 matrices, just to be safe @@ -124,6 +96,21 @@ program main !if (me.eq.0) print*,'max_mem=',max_mem + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' + write(*,'(a22,i8)') 'Number of GA procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + #if PRK_CHECK_GA_MPI ! We do use MPI anywhere, but if we did, we would need to avoid MPI collectives ! on the world communicator, because it is possible for that to be larger than @@ -140,14 +127,6 @@ program main endif #endif - if (me.eq.0) then - write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' - write(*,'(a22,i12)') 'Number of GA procs = ', np - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - endif - call ga_sync() ! ******************************************************************** diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ebab0c406 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..b18c3b64f 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,7 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, lo, hi, q + integer(kind=INT32) :: i, j, k, lo, hi, q, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +116,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) From a73e3cec55ffe4203002e639df8fabc8454fc09e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:06:18 +0300 Subject: [PATCH 47/80] print fix --- FORTRAN/transpose-a2a-mpi.F90 | 2 +- FORTRAN/transpose-acc-mpi.F90 | 2 +- FORTRAN/transpose-get-mpi.F90 | 2 +- FORTRAN/transpose-p2p-mpi.F90 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c38158397..72a55c797 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -117,7 +117,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 2b49bb0bd..6ac96b7cf 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index ebab0c406..96a5470d5 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index b18c3b64f..b7fc14605 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -118,7 +118,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = 
', iterations write(*,'(a22,i8)') 'Matrix order = ', order From 908e3a304f5e70d346f7f0c1c6100245ded5439c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:12:56 +0300 Subject: [PATCH 48/80] args --- FORTRAN/transpose-coarray.F90 | 57 +++++------------------------------ 1 file changed, 8 insertions(+), 49 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index bc15f1238..08526a1bb 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -58,10 +58,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp integer :: me, np logical :: printer ! problem definition @@ -90,37 +87,18 @@ program main ! ******************************************************************** if (printer) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(6,'(a25)') 'Parallel Research Kernels' write(6,'(a41)') 'Fortran coarray Matrix transpose: B = A^T' + write(6,'(a23,i8)') 'Number of images = ', np + write(6,'(a23,i8)') 'Number of iterations = ', iterations + write(6,'(a23,i8)') 'Matrix order = ', order + write(6,'(a23,i8)') 'Tile size = ', tile_size endif + call co_broadcast(iterations,1) + call co_broadcast(order,1) + call co_broadcast(tile_size,1) - if (command_argument_count().lt.2) then - if (printer) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(6,'(a62)') 'Usage: ./transpose <# iterations> []' - endif - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - if (printer) then - write(6,'(a35,i5)') 'ERROR: iterations must be >= 1 : ', iterations - endif - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - if (printer) then - write(6,'(a30,i5)') 'ERROR: order must be >= 1 : ', order - endif - stop 1 - endif if (modulo(order,np).gt.0) then if (printer) then write(6,'(a20,i5,a35,i5)') 'ERROR: matrix order ',order,& @@ -130,18 +108,6 @@ program main endif block_order = order/np - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(6,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** @@ -152,13 +118,6 @@ program main stop 1 endif - if (printer) then - write(6,'(a23,i8)') 'Number of images = ', np - write(6,'(a23,i8)') 'Number of iterations = ', iterations - write(6,'(a23,i8)') 'Matrix order = ', order - write(6,'(a23,i8)') 'Tile size = ', tile_size - endif - ! initialization ! 
local column index j corresponds to global column index block_order*me+j if ((tile_size.gt.1).and.(tile_size.lt.order)) then From f465bed8fa7f3d5ef319f0102f9b24f17582e688 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Thu, 2 Jun 2022 09:47:05 -0600 Subject: [PATCH 49/80] Fix typos in README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 14a059365..4fbb49dd8 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ executed on many computing systems. These programs should not be used as benchmarks. They are operations to explore features of a hardware platform, but they do not define fixed problems that can be used to rank systems. Furthermore -they have not been optimimzed for the features of any particular system. +they have not been optimized for the features of any particular system. # Build Instructions @@ -51,7 +51,7 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | -Some of the C++ implementations require you to install Boost, RAJA, KOKKOS, Parallel STL, respectively, +Some of the C++ implementations require you to install Boost, RAJA, Kokkos, Parallel STL, respectively, and then modify `make.defs` appropriately. Please see the documentation in the [documentation](https://github.com/ParRes/Kernels/tree/default/doc) (`doc`) subdirectory. @@ -215,7 +215,7 @@ be used unless a `make veryclean` has been issued. ## Individual make -Descend into the desired sub-tree and cd to the kernel(s) of interest. +Descend into the desired sub-tree and `cd` to the kernel(s) of interest. Each kernel has its own Makefile. There are a number of parameters that determine the behavior of the kernel that need to be known at compile time. These are explained succinctly in the Makefile itself. 
Edit From 00e68f8ddb482149255fc3bf77d878e8b2ff1bc5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:11:01 +0300 Subject: [PATCH 50/80] mpifort required for prk_mpi_mod --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4faa6796a..6d3b0c1f1 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -99,7 +99,7 @@ prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 - $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + $(MPIFORT) $(FCFLAGS) -c $< -o prk_mpi_mod.o stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 From c89329dd810f315f96c3d6ba15a13d6e922e7ee8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:13:36 +0300 Subject: [PATCH 51/80] default(none) and MPI_COMM_WORLD cannot coexist --- FORTRAN/nstream-mpi.F90 | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 66ba8d30c..aa6c6b408 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -136,11 +136,10 @@ program main scalar = 3 #ifdef _OPENMP - !$omp parallel default(none) & + !$omp parallel & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) & - !$omp& shared(MPI_COMM_WORLD) + !$omp& private(i,k,t0,t1) #endif #if defined(_OPENMP) From 83bad194530f9f3778cff68c9fcfd68d15e64146 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:14:08 +0300 Subject: [PATCH 52/80] update for homebrew --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 05a06c0ee..afcf1a6ae 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -205,7 +205,7 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.1_2 +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From eeabb8c15c97797ec1840e9b8adc15c74e53ea9a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:38:22 +0300 Subject: [PATCH 53/80] replace non-constant tiling with automatic tiling non-constant tiling was supported by NVHPC not GCC, and was not standard anyways. --- FORTRAN/stencil-openacc.F90 | 98 +++++++++-------------------------- FORTRAN/transpose-openacc.F90 | 7 ++- 2 files changed, 28 insertions(+), 77 deletions(-) diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index a5543e5f3..da660dd22 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -61,82 +61,42 @@ ! ! 
******************************************************************* -subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) +subroutine apply_stencil(is_star,r,n,W,A,B) use, intrinsic :: iso_fortran_env implicit none - logical, intent(in) :: is_star, tiling - integer(kind=INT32), intent(in) :: tile_size, r, n + logical, intent(in) :: is_star + integer(kind=INT32), intent(in) :: r, n real(kind=REAL64), intent(in) :: W(-r:r,-r:r) real(kind=REAL64), intent(in) :: A(n,n) real(kind=REAL64), intent(inout) :: B(n,n) - integer(kind=INT32) :: i, j, ii, jj, it, jt + integer(kind=INT32) :: i, j, ii, jj !$acc data pcopyin(W,A) pcopy(B) if (is_star) then - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) enddo - enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo - enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo - enddo - endif ! tiling - else ! grid - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo - enddo + enddo + else ! grid + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + do ii=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) enddo enddo enddo - endif ! tiling + enddo endif ! star !$acc end data end subroutine apply_stencil @@ -150,8 +110,6 @@ program main integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension integer(kind=INT32) :: stencil_size ! number of points in stencil - integer(kind=INT32) :: tile_size ! loop nest block factor - logical :: tiling ! boolean indication loop nest blocking logical :: is_star ! true = star, false = grid integer(kind=INT32), parameter :: r=RADIUS ! radius of stencil real(kind=REAL64) :: W(-r:r,-r:r) ! weights of points in the stencil @@ -172,7 +130,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenACC Stencil execution on 2D grid' - call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) + call prk_get_arguments('stencil',iterations=iterations,order=n) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -181,8 +139,6 @@ program main is_star = .false. 
#endif - tiling = (tile_size.ne.n) - write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Grid size = ', n write(*,'(a22,i8)') 'Radius of stencil = ', r @@ -193,11 +149,7 @@ program main write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - if (tiling) then - write(*,'(a22,i8)') 'Tile size = ', tile_size - else - write(*,'(a10)') 'Tiling off' - endif + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and perform the computation @@ -228,7 +180,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() ! Apply the stencil operator - call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) + call apply_stencil(is_star,r,n,W,A,B) ! add constant to solution to force refresh of neighbor data, if any !$acc parallel loop collapse(2) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 1a0a69fe9..ad242cbfb 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -66,7 +66,6 @@ program main integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -78,11 +77,11 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + call prk_get_arguments('transpose',iterations=iterations,order=order) write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order - write(*,'(a22,i8)') 'Tile size = ', tile_size + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -110,7 +109,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() - !$acc parallel loop tile(tile_size,tile_size) + !$acc parallel loop tile(*,*) do j=1,order do i=1,order B(j,i) = B(j,i) + A(i,j) From dd8308e00648a3b402ccdd613cfd2ee5bbae5c09 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:06:52 +0300 Subject: [PATCH 54/80] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4fbb49dd8..994639a7e 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,16 @@ If you are looking for the simplest option, try `make.defs.gcc`. | File (in `./common/`) | Environment | |----------------------|-------------------------| -| `make.defs.cray` | Cray compilers on Cray XC systems. | +| `make.defs.cray` | Cray toolchain (rarely tested). | | `make.defs.cuda` | GCC with the CUDA compiler (only used in C++/CUDA implementation). | -| `make.defs.gcc` | GCC compiler tool chain, which supports essentially all implementations. | +| `make.defs.gcc` | GCC compiler toolchain, which supports essentially all implementations (tested often). | | `make.defs.freebsd` | FreeBSD (rarely tested). | | `make.defs.ibmbg` | IBM Blue Gene/Q compiler toolchain (deprecated). | -| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms. | -| `make.defs.intel` | Intel compiler tool chain, which supports most implementations. 
| -| `make.defs.llvm` | LLVM compiler tool chain, which supports most implementations. | -| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which is required to use C11 threads. | +| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms (rarely tested). | +| `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | +| `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | +| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | +| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | | `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From 029c003c20a5ffaf522708685e673200f7c5de9e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:31:00 +0300 Subject: [PATCH 55/80] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 994639a7e..4c0852d4d 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,8 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | | `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | | `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | -| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | -| `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | +| `make.defs.nvhpc` | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-downloads), which supports most implementations (tested often). | +| `make.defs.oneapi` | Intel [oneAPI](https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From b1699023fb8d491c8a17d485dc57814b2e799005 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Oct 2022 13:45:38 +0300 Subject: [PATCH 56/80] update OpenCL C++ header this ancient one does not compile with ICPX or Clang when C++20 is enabled --- Cxx11/{cl2.hpp => opencl.hpp} | 706 +++++++++++++++++++++------------- Cxx11/prk_opencl.h | 2 +- 2 files changed, 429 insertions(+), 279 deletions(-) rename Cxx11/{cl2.hpp => opencl.hpp} (94%) diff --git a/Cxx11/cl2.hpp b/Cxx11/opencl.hpp similarity index 94% rename from Cxx11/cl2.hpp rename to Cxx11/opencl.hpp index 09e295ec5..1e61d7890 100644 --- a/Cxx11/cl2.hpp +++ b/Cxx11/opencl.hpp @@ -1,36 +1,23 @@ -/******************************************************************************* - * Copyright (c) 2008-2016 The Khronos Group Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS - * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS - * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT - * https://www.khronos.org/registry/ - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - ******************************************************************************/ +// +// Copyright (c) 2008-2020 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), - * OpenCL 1.2 (rev 15), OpenCL 2.0 (rev 29), OpenCL 2.1 (rev 17), - * and OpenCL 2.2 (V2.2-11). + * \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2, + * OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0. * \author Lee Howes and Bruce Merry * * Derived from the OpenCL 1.x C++ bindings written by @@ -73,10 +60,10 @@ * For many large applications C++ is the language of choice and so it seems * reasonable to define C++ bindings for OpenCL. * - * The interface is contained with a single C++ header file \em cl2.hpp and all + * The interface is contained with a single C++ header file \em opencl.hpp and all * definitions are contained within the namespace \em cl. There is no additional * requirement to include \em cl.h and to use either the C++ or original C - * bindings; it is enough to simply include \em cl2.hpp. + * bindings; it is enough to simply include \em opencl.hpp. * * The bindings themselves are lightweight and correspond closely to the * underlying C API. Using the C++ bindings introduces no additional execution @@ -85,7 +72,7 @@ * There are numerous compatibility, portability and memory management * fixes in the new header as well as additional OpenCL 2.0 features. * As a result the header is not directly backward compatible and for this - * reason we release it as cl2.hpp rather than a new version of cl.hpp. 
+ * reason we release it as opencl.hpp rather than a new version of cl.hpp. * * * \section compatibility Compatibility @@ -157,30 +144,26 @@ * - CL_HPP_NO_STD_STRING * * Do not use the standard library string class. cl::string is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_VECTOR * * Do not use the standard library vector class. cl::vector is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_ARRAY * * Do not use the standard library array class. cl::array is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_UNIQUE_PTR * * Do not use the standard library unique_ptr class. cl::pointer and * the cl::allocate_pointer functions are not defined and may be - * defined by the user before cl2.hpp is included. - * - * - CL_HPP_ENABLE_DEVICE_FISSION - * - * Enables device fission for OpenCL 1.2 platforms. + * defined by the user before opencl.hpp is included. * * - CL_HPP_ENABLE_EXCEPTIONS * @@ -207,10 +190,22 @@ * applies to use of cl::Program construction and other program * build variants. * + * - CL_HPP_USE_CL_DEVICE_FISSION + * + * Enable the cl_ext_device_fission extension. + * + * - CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR + * + * Enable the cl_khr_image2d_from_buffer extension. + * * - CL_HPP_USE_CL_SUB_GROUPS_KHR * * Enable the cl_khr_subgroups extension. * + * - CL_HPP_USE_DX_INTEROP + * + * Enable the cl_khr_d3d10_sharing extension. + * * - CL_HPP_USE_IL_KHR * * Enable the cl_khr_il_program extension. @@ -222,12 +217,16 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. + * + * Note: the C++ bindings use std::call_once and therefore may need to be + * compiled using special command-line options (such as "-pthread") on some + * platforms! * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - #include + #include #include #include #include @@ -237,28 +236,30 @@ int main(void) { - // Filter for a 2.0 platform and set it as the default + // Filter for a 2.0 or newer platform and set it as the default std::vector platforms; cl::Platform::get(&platforms); cl::Platform plat; for (auto &p : platforms) { std::string platver = p.getInfo(); - if (platver.find("OpenCL 2.") != std::string::npos) { + if (platver.find("OpenCL 2.") != std::string::npos || + platver.find("OpenCL 3.") != std::string::npos) { + // Note: an OpenCL 3.x platform may not support all required features! 
plat = p; } } - if (plat() == 0) { - std::cout << "No OpenCL 2.0 platform found."; + if (plat() == 0) { + std::cout << "No OpenCL 2.0 or newer platform found.\n"; return -1; } cl::Platform newP = cl::Platform::setDefault(plat); if (newP != plat) { - std::cout << "Error setting default platform."; + std::cout << "Error setting default platform.\n"; return -1; } - // Use C++11 raw string literals for kernel source code + // C++11 raw string literal for the first kernel std::string kernel1{R"CLC( global int globalA; kernel void updateGlobal() @@ -266,6 +267,8 @@ globalA = 75; } )CLC"}; + + // Raw string literal for the second kernel std::string kernel2{R"CLC( typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, @@ -292,8 +295,9 @@ } )CLC"}; - // New simpler string interface style - std::vector programStrings {kernel1, kernel2}; + std::vector programStrings; + programStrings.push_back(kernel1); + programStrings.push_back(kernel2); cl::Program vectorAddProgram(programStrings); try { @@ -332,10 +336,9 @@ std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); - // ////////////// - // Traditional cl_mem allocations + std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); @@ -359,14 +362,8 @@ // This one was not passed as a parameter vectorAddKernel.setSVMPointers(anSVMInt); - // Hand control of coarse allocations to runtime - cl::enqueueUnmapSVM(anSVMInt); - cl::enqueueUnmapSVM(fooPointer); - cl::unmapSVM(inputB); - cl::unmapSVM(output2); - - cl_int error; - vectorAddKernel( + cl_int error; + vectorAddKernel( cl::EnqueueArgs( cl::NDRange(numElements/2), cl::NDRange(numElements/2)), @@ -377,12 +374,10 @@ 3, aPipe, defaultDeviceQueue, - error + error ); cl::copy(outputBuffer, begin(output), end(output)); - // Grab the SVM output vector using a map - cl::mapSVM(output2); cl::Device d = cl::Device::getDefault(); @@ -406,59 +401,60 @@ * both and hence work with either version of the bindings. */ #if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP) -# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") +# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") # define CL_HPP_USE_DX_INTEROP #endif #if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) -# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") +# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") # define CL_HPP_USE_CL_DEVICE_FISSION #endif #if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) -# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") +# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") # define CL_HPP_ENABLE_EXCEPTIONS #endif #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) -# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") +# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") # define CL_HPP_NO_STD_VECTOR #endif #if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING) -# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. 
Define CL_HPP_NO_STD_STRING instead") +# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead") # define CL_HPP_NO_STD_STRING #endif #if defined(VECTOR_CLASS) -# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") +# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") #endif #if defined(STRING_CLASS) -# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") +# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") #endif #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") +# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") # define CL_HPP_USER_OVERRIDE_ERROR_STRINGS #endif /* Warn about features that are no longer supported */ #if defined(__USE_DEV_VECTOR) -# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") #endif #if defined(__USE_DEV_STRING) -# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") #endif /* Detect which version to target */ #if !defined(CL_HPP_TARGET_OPENCL_VERSION) -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 220 (OpenCL 2.2)") -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)") +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif #if CL_HPP_TARGET_OPENCL_VERSION != 100 && \ CL_HPP_TARGET_OPENCL_VERSION != 110 && \ CL_HPP_TARGET_OPENCL_VERSION != 120 && \ CL_HPP_TARGET_OPENCL_VERSION != 200 && \ CL_HPP_TARGET_OPENCL_VERSION != 210 && \ - CL_HPP_TARGET_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 220") + CL_HPP_TARGET_OPENCL_VERSION != 220 && \ + CL_HPP_TARGET_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).") # undef CL_HPP_TARGET_OPENCL_VERSION -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif /* Forward target OpenCL version to C headers if necessary */ @@ -480,8 +476,9 @@ CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 100") + CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). 
It will be set to 100") # undef CL_HPP_MINIMUM_OPENCL_VERSION # define CL_HPP_MINIMUM_OPENCL_VERSION 100 #endif @@ -541,13 +538,15 @@ #include #endif // !__APPLE__ -#if (__cplusplus >= 201103L) +#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L ) #define CL_HPP_NOEXCEPT_ noexcept #else #define CL_HPP_NOEXCEPT_ #endif -#if defined(_MSC_VER) +#if __cplusplus >= 201703L +# define CL_HPP_DEFINE_STATIC_MEMBER_ inline +#elif defined(_MSC_VER) # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) #elif defined(__MINGW32__) # define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany)) @@ -557,19 +556,26 @@ // Define deprecated prefixes and suffixes to ensure compilation // in case they are not pre-defined -#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) - -#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) #if !defined(CL_CALLBACK) #define CL_CALLBACK @@ -1326,13 +1332,20 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ \ + F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \ + \ + F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \ + F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector) \ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector) \ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \ + F(cl_device_info, 
CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \ \ F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \ F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \ @@ -1352,6 +1365,14 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \ + F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \ + F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \ F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \ F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \ F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \ @@ -1367,17 +1388,17 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector) #define CL_HPP_PARAM_NAME_INFO_2_1_(F) \ - F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, size_type) \ + F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \ F(cl_program_info, CL_PROGRAM_IL, cl::vector) \ - F(cl_kernel_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ - F(cl_kernel_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) \ F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \ F(cl_device_info, CL_DEVICE_IL_VERSION, string) \ F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \ F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \ F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) + F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \ + F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) #define CL_HPP_PARAM_NAME_INFO_2_2_(F) \ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ @@ -1390,6 +1411,43 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector) +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \ + F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector) + +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr) + +#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \ + 
F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector) \ + F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ + F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector) \ + F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \ + F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \ + \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector) \ + F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector) \ + F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector) \ + F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) + template struct param_traits {}; @@ -1418,12 +1476,15 @@ CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_) #if CL_HPP_TARGET_OPENCL_VERSION >= 220 CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 +CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 -#if defined(CL_HPP_USE_IL_KHR) +#if defined(CL_HPP_USE_IL_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_IL_KHR) @@ -1454,6 +1515,35 @@ CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_); #endif // CL_HPP_USE_CL_DEVICE_FISSION +#if defined(cl_khr_extended_versioning) +#if CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_khr_extended_versioning + +#if defined(cl_khr_device_uuid) +using uuid_array = array; +using luid_array = array; +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint) +#endif + +#if defined(cl_khr_pci_bus_info) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr) +#endif + +#if defined(cl_khr_integer_dot_product) 
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr) +#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +#endif // defined(cl_khr_integer_dot_product) + #ifdef CL_PLATFORM_ICD_SUFFIX_KHR CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string) #endif @@ -1461,7 +1551,6 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, strin #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) #endif - #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector) #endif @@ -1492,6 +1581,9 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) #endif +#ifdef CL_DEVICE_BOARD_NAME_AMD +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string) +#endif #ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) @@ -1499,6 +1591,30 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_AR #ifdef CL_DEVICE_JOB_SLOTS_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) #endif +#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield) +#endif +#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector) +#endif +#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint) +#endif #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) @@ -1862,6 +1978,7 @@ class Wrapper retVal = true; #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + (void)device; return retVal; } @@ -1982,51 +2099,7 @@ inline bool 
operator!=(const Wrapper &lhs, const Wrapper &rhs) //! \endcond -using BuildLogType = vector::param_type>>; -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -/** -* Exception class for build errors to carry build info -*/ -class BuildError : public Error -{ -private: - BuildLogType buildLogs; -public: - BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) - { - } - BuildLogType getBuildLog() const - { - return buildLogs; - } -}; -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - if (err != CL_SUCCESS) { - throw BuildError(err, errStr, buildLogs); - } - return err; - } -} // namespace detail - -#else -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - (void)buildLogs; // suppress unused variable warning - (void)errStr; - return err; - } -} // namespace detail -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) /*! \stuct ImageFormat @@ -2046,6 +2119,9 @@ struct ImageFormat : public cl_image_format image_channel_data_type = type; } + //! \brief Copy constructor. + ImageFormat(const ImageFormat &other) { *this = other; } + //! \brief Assignment operator. ImageFormat& operator = (const ImageFormat& rhs) { @@ -2187,7 +2263,7 @@ class Device : public detail::Wrapper } //! \brief Wrapper for clGetDeviceInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2299,7 +2375,7 @@ class Device : public detail::Wrapper const cl_device_partition_property_ext * /* properties */, cl_uint /*num_entries*/, cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1; static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); @@ -2333,6 +2409,52 @@ class Device : public detail::Wrapper #endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) }; +using BuildLogType = vector::param_type>>; +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +/** +* Exception class for build errors to carry build info +*/ +class BuildError : public Error +{ +private: + BuildLogType buildLogs; +public: + BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) + { + } + + BuildLogType getBuildLog() const + { + return buildLogs; + } +}; +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + if (err != CL_SUCCESS) { + throw BuildError(err, errStr, buildLogs); + } + return err; + } +} // namespace detail + +#else +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + (void)buildLogs; // suppress unused variable warning + (void)errStr; + return err; + } +} // namespace detail +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) + CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS; @@ -2465,7 +2587,8 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo(). 
- cl_int getInfo(cl_platform_info name, string* param) const + template + cl_int getInfo(cl_platform_info name, T* param) const { return detail::errHandler( detail::getInfo(&::clGetPlatformInfo, object_, name, param), @@ -2473,7 +2596,7 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2708,8 +2831,8 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS; * Unload the OpenCL compiler. * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. */ -inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int -UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED; inline cl_int UnloadCompiler() { @@ -2799,7 +2922,7 @@ class Context */ Context( const vector& devices, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2828,9 +2951,13 @@ class Context } } + /*! \brief Constructs a context including a specific device. + * + * Wraps clCreateContext(). + */ Context( const Device& device, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2860,7 +2987,7 @@ class Context */ Context( cl_device_type type, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -3030,7 +3157,7 @@ class Context } //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3172,7 +3299,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3195,7 +3322,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getProfilingInfo(cl_int* err = NULL) const { @@ -3226,7 +3353,7 @@ class Event : public detail::Wrapper */ cl_int setCallback( cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), void * user_data = NULL) { return detail::errHandler( @@ -3387,7 +3514,7 @@ class Memory : public detail::Wrapper } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3415,7 +3542,7 @@ class Memory : public detail::Wrapper * value - not the Memory class instance. 
*/ cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), void * user_data = NULL) { return detail::errHandler( @@ -3758,7 +3885,7 @@ cl::pointer> allocate_pointer(const Alloc &alloc_, Arg return cl::pointer>(tmp, detail::Deleter{alloc, copies}); } - catch (std::bad_alloc& b) + catch (std::bad_alloc&) { std::allocator_traits::deallocate(alloc, tmp, copies); throw; @@ -3893,7 +4020,7 @@ class Buffer : public Memory Context context = Context::getDefault(err); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -4006,7 +4133,7 @@ class Buffer : public Memory } return result; - } + } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 }; @@ -4385,7 +4512,7 @@ class Image : public Memory } //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getImageInfo(cl_int* err = NULL) const { @@ -4422,12 +4549,11 @@ class Image1D : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D, - width, - 0, 0, 0, 0, 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + object_ = ::clCreateImage( context(), flags, @@ -4510,13 +4636,12 @@ class Image1DBuffer : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - width, - 0, 0, 0, 0, 0, 0, 0, - buffer() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.buffer = buffer(); + object_ = ::clCreateImage( context(), flags, @@ -4596,15 +4721,13 @@ class Image1DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_ARRAY, - width, - 0, 0, // height, depth (unused) - arraySize, - rowPitch, - 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_width = width; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + object_ = ::clCreateImage( context(), flags, @@ -4711,15 +4834,12 @@ class Image2D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + object_ = ::clCreateImage( context(), flags, @@ -4765,17 +4885,13 @@ class Image2D : public Image { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, - // Use buffer as input to image - sourceBuffer() - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.buffer = sourceBuffer(); + object_ = ::clCreateImage( context(), 0, // flags inherited from buffer @@ -4829,19 +4945,16 @@ class Image2D : public Image // Update only the channel order. // Channel format inherited from source. 
sourceFormat.image_channel_order = order; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - sourceWidth, - sourceHeight, - 0, 0, // depth (unused), array size (unused) - sourceRowPitch, - 0, // slice pitch (unused) - sourceNumMIPLevels, - sourceNumSamples, - // Use buffer as input to image - sourceImage() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = sourceWidth; + desc.image_height = sourceHeight; + desc.image_row_pitch = sourceRowPitch; + desc.num_mip_levels = sourceNumMIPLevels; + desc.num_samples = sourceNumSamples; + desc.buffer = sourceImage(); + object_ = ::clCreateImage( context(), 0, // flags should be inherited from mem_object @@ -4921,7 +5034,7 @@ class Image2D : public Image * \see Memory * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. */ -class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D +class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D { public: /*! \brief Constructs an Image2DGL in a specified context, from a given @@ -5004,7 +5117,7 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D return *this; } -} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +} CL_API_SUFFIX__VERSION_1_1_DEPRECATED; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -5027,17 +5140,15 @@ class Image2DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D_ARRAY, - width, - height, - 0, // depth (unused) - arraySize, - rowPitch, - slicePitch, - 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_width = width; + desc.image_height = height; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + object_ = ::clCreateImage( context(), flags, @@ -5142,17 +5253,14 @@ class Image3D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE3D, - width, - height, - depth, - 0, // array size (unused) - row_pitch, - slice_pitch, - 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + object_ = ::clCreateImage( context(), flags, @@ -5534,7 +5642,7 @@ class Pipe : public Memory } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5667,7 +5775,7 @@ class Sampler : public detail::Wrapper } //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
- template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5890,7 +5998,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5912,7 +6020,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_ARG_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getArgInfo(cl_uint argIndex, cl_int* err = NULL) const { @@ -5936,7 +6044,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_WORK_GROUP_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getWorkGroupInfo(const Device& device, cl_int* err = NULL) const { @@ -5971,7 +6079,7 @@ class Kernel : public detail::Wrapper #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 } - template + template size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const { size_type param; @@ -6134,6 +6242,23 @@ class Kernel : public detail::Wrapper sizeof(void*)*(1 + sizeof...(Ts)), pointerList.data())); } + + template + cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val) + { + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + param_name, + sizeof(T), + &val)); + } + + template + cl_int setExecInfo(typename detail::param_traits::param_type& val) + { + return setExecInfo(name, val); + } #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6339,8 +6464,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6393,8 +6517,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6538,7 +6661,7 @@ class Program : public detail::Wrapper Program() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_program - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
* Defaults to false to maintain compatibility with @@ -6606,6 +6729,27 @@ class Program : public detail::Wrapper return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } + cl_int build( + const Device& device, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + cl_device_id deviceID = device(); + + cl_int buildError = ::clBuildProgram( + object_, + 1, + &deviceID, + options, + notifyFptr, + data); + + BuildLogType buildLog(0); + buildLog.push_back(std::make_pair(device, getBuildInfo(device))); + return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog); + } + cl_int build( const char* options = NULL, void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, @@ -6619,7 +6763,6 @@ class Program : public detail::Wrapper notifyFptr, data); - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } @@ -6651,7 +6794,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -6674,7 +6817,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_BUILD_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getBuildInfo(const Device& device, cl_int* err = NULL) const { @@ -6692,7 +6835,7 @@ class Program : public detail::Wrapper * info type and for all devices in the program. * On an error reading the info for any device, an empty vector of info will be returned. */ - template + template vector::param_type>> getBuildInfo(cl_int *err = NULL) const { @@ -6762,6 +6905,7 @@ class Program : public detail::Wrapper } #if CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Registers a callback function to be called when destructors for * program scope global variables are complete and before the * program is released. @@ -6772,9 +6916,9 @@ class Program : public detail::Wrapper * on a callback stack associated with program. The registered user callback * functions are called in the reverse order in which they were registered. */ - cl_int setReleaseCallback( + CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback( void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), - void * user_data = NULL) + void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED { return detail::errHandler( ::clSetProgramReleaseCallback( @@ -6783,6 +6927,7 @@ class Program : public detail::Wrapper user_data), __SET_PROGRAM_RELEASE_CALLBACK_ERR); } +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Sets a SPIR-V specialization constant. * @@ -6978,6 +7123,11 @@ inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) return static_cast(static_cast(lhs) | static_cast(rhs)); } +inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs) +{ + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + /*! \class CommandQueue * \brief CommandQueue interface for cl_command_queue. */ @@ -7434,7 +7584,7 @@ class CommandQueue : public detail::Wrapper CommandQueue() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_command_queue - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
* Defaults to false to maintain compatibility with @@ -7486,7 +7636,7 @@ class CommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -8119,7 +8269,7 @@ class CommandQueue : public detail::Wrapper { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(container.data()), container.size(), + object_, blocking, flags, static_cast(container.data()), container.size()*sizeof(T), (events != NULL) ? (cl_uint)events->size() : 0, (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, (event != NULL) ? &tmp : NULL), @@ -8478,10 +8628,10 @@ class CommandQueue : public detail::Wrapper } #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) - CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( + CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8538,8 +8688,8 @@ class CommandQueue : public detail::Wrapper * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8554,8 +8704,8 @@ class CommandQueue : public detail::Wrapper return err; } - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const vector& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const vector& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueWaitForEvents( @@ -8691,8 +8841,8 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueBarrier(object_), @@ -8866,7 +9016,7 @@ class DeviceCommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -9038,7 +9188,7 @@ Buffer::Buffer( size_type size = sizeof(DataType)*(endIterator - startIterator); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -9091,7 +9241,7 @@ Buffer::Buffer( Context context = queue.getInfo(); if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); @@ -9213,7 +9363,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::pointer ptr, + 
cl::pointer &ptr, cl_bool blocking, cl_map_flags flags, size_type size, @@ -9237,7 +9387,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::vector container, + cl::vector &container, cl_bool blocking, cl_map_flags flags, const vector* events = NULL, @@ -10063,7 +10213,7 @@ class KernelFunctor namespace compatibility { /** - * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp. + * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp. * Please use KernelFunctor directly. */ template diff --git a/Cxx11/prk_opencl.h b/Cxx11/prk_opencl.h index f8f0ade9c..b8d783438 100644 --- a/Cxx11/prk_opencl.h +++ b/Cxx11/prk_opencl.h @@ -19,7 +19,7 @@ #include -#include "cl2.hpp" +#include "opencl.hpp" namespace prk { From dee4ba9d96b9c7b0eeba6c28dc79d23776f31530 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sat, 5 Nov 2022 14:36:36 -0500 Subject: [PATCH 57/80] RUST: nstream with rayon! --- .gitignore | 2 + RUST/Makefile | 1 + RUST/nstream-rayon/Cargo.toml | 9 ++ RUST/nstream-rayon/src/main.rs | 184 +++++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 RUST/nstream-rayon/Cargo.toml create mode 100644 RUST/nstream-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index a92a237e3..bd9ee8deb 100644 --- a/.gitignore +++ b/.gitignore @@ -377,6 +377,8 @@ RUST/nstream-unsafe/Cargo.lock RUST/nstream-unsafe/target/ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ +RUST/nstream-rayon/Cargo.lock +RUST/nstream-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index d70e5855e..9904e005b 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -16,6 +16,7 @@ all: cd nstream && cargo build $(RCFLAGS) cd nstream-unsafe && cargo build $(RCFLAGS) cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml new file mode 100644 index 000000000..054caa930 --- /dev/null +++ b/RUST/nstream-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "nstream" +version = "0.1.0" +authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/nstream-rayon/src/main.rs b/RUST/nstream-rayon/src/main.rs new file mode 100644 index 000000000..4d02cb145 --- /dev/null +++ b/RUST/nstream-rayon/src/main.rs @@ -0,0 +1,184 @@ +// +// Copyright (c) 2020, Intel Corporation +// Copyright (c) 2020, Thomas Hayward-Schneider +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: nstream +// +// PURPOSE: To compute memory bandwidth when adding a vector of a given +// number of double precision values to the scalar multiple of +// another vector of the same length, and storing the result in +// a third vector. +// +// USAGE: The program takes as input the number +// of iterations to loop over the triad vectors, the length of the +// vectors, and the offset between vectors +// +// <# iterations> +// +// The output consists of diagnostics to make sure the +// algorithm worked, and of timing statistics. +// +// NOTES: Bandwidth is determined as the number of words read, plus the +// number of words written, times the size of the words, divided +// by the execution time. For a vector length of N, the total +// number of words read and written is 4*N*sizeof(double). +// +// HISTORY: This code is loosely based on the Stream benchmark by John +// McCalpin, but does not follow all the Stream rules. Hence, +// reported results should not be associated with Stream in +// external publications +// +// Converted to C++11 by Jeff Hammond, November 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +//use std::num; // abs? 
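+// rayon's prelude supplies the parallel-iterator traits (IntoParallelIterator,
+// ParallelIterator) used below: the three vectors are zipped with into_par_iter()
+// so the triad update of `a` runs across threads.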
+use rayon::prelude::*; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust STREAM triad: A = B + scalar * C"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let length: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + length = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("vector length = {}", length); + + /////////////////////////////////////////////// + // Allocate space and perform the computation + /////////////////////////////////////////////// + + let mut a: Vec = vec![0.0; length]; + let b: Vec = vec![2.0; length]; + let c: Vec = vec![2.0; length]; + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + let scalar: f64 = 3.0; + + for _k in 0..iterations + 1 { + if _k == 1 { + t0 = timer.elapsed(); + } + + (&mut a, &b, &c).into_par_iter().for_each(|(x, y, z)| { + *x += *y + scalar * (*z); + }); + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let nstream_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let mut ar: f64 = 0.0; + let br: f64 = 2.0; + let cr: f64 = 2.0; + for _k in 0..iterations + 1 { + ar += br + scalar * cr; + } + + ar *= length as f64; + + let mut asum = 0.0; + for i in 0..length { + let absa: f64 = a[i].abs(); + asum += absa; + } + + let err: f64 = (ar - asum) / asum; + let abserr: f64 = err.abs(); + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (nstream_time as f64) / (iterations as f64); + let nbytes: usize = 4 * length * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nbytes as f64) / avgtime, + avgtime + ); + } else { + println!("Failed Validation on output array"); + println!(" Expected checksum: {}", ar); + println!(" Observed checksum: {}", asum); + println!("ERROR: solution did not validate"); + } + return; +} From be2972f727c5b322b4cb4f3e4f79f2f4f9329002 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sun, 6 Nov 2022 21:32:55 -0600 Subject: [PATCH 58/80] RUST: dgemm with iter and rayon! 
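
The iterator version (dgemm-iter, added below) expresses C += A x B as nested
iterator chains: rows of C zipped with rows of A, then each element of a row of
A zipped with the matching row of B. The rayon version is expected to differ
mainly in parallelizing the outermost row loop. A rough sketch of that idea,
using rayon's par_chunks_exact_mut / par_chunks_exact slice iterators (whether
the patch uses exactly these calls is not shown here; the function name and
signature are illustrative only):

    use rayon::prelude::*;

    // Accumulate C += A x B for order x order row-major matrices,
    // distributing whole rows of C across threads.
    fn dgemm_rows(order: usize, a: &[f64], b: &[f64], c: &mut [f64]) {
        c.par_chunks_exact_mut(order)       // ci: mutable row i of C
            .zip(a.par_chunks_exact(order)) // ai: row i of A
            .for_each(|(ci, ai)| {
                for (aik, bk) in ai.iter().zip(b.chunks_exact(order)) {
                    for (cij, bkj) in ci.iter_mut().zip(bk.iter()) {
                        *cij += aik * bkj;
                    }
                }
            });
    }

Keeping the parallelism on the outermost chunks means each row of C is written
by exactly one thread, so no synchronization is needed on the output.
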
modified: .gitignore modified: RUST/Makefile new file: RUST/dgemm-iter/Cargo.toml new file: RUST/dgemm-iter/src/main.rs new file: RUST/dgemm-rayon/Cargo.toml new file: RUST/dgemm-rayon/src/main.rs modified: RUST/dgemm/Cargo.toml modified: RUST/dgemm/src/main.rs modified: RUST/transpose/Cargo.toml modified: RUST/transpose/src/main.rs --- .gitignore | 6 + RUST/Makefile | 37 +++-- RUST/dgemm-iter/Cargo.toml | 6 + RUST/dgemm-iter/src/main.rs | 202 ++++++++++++++++++++++++ RUST/dgemm-rayon/Cargo.toml | 9 ++ RUST/dgemm-rayon/src/main.rs | 204 ++++++++++++++++++++++++ RUST/dgemm/Cargo.toml | 7 +- RUST/dgemm/src/main.rs | 225 +++++++++++++-------------- RUST/transpose/Cargo.toml | 4 +- RUST/transpose/src/main.rs | 291 +++++++++++++++++++++-------------- 10 files changed, 734 insertions(+), 257 deletions(-) create mode 100644 RUST/dgemm-iter/Cargo.toml create mode 100644 RUST/dgemm-iter/src/main.rs create mode 100644 RUST/dgemm-rayon/Cargo.toml create mode 100644 RUST/dgemm-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index bd9ee8deb..9ba4c2b06 100644 --- a/.gitignore +++ b/.gitignore @@ -379,6 +379,12 @@ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ +RUST/dgemm/Cargo.lock +RUST/dgemm/target/ +RUST/dgemm-iter/Cargo.lock +RUST/dgemm-iter/target/ +RUST/dgemm-rayon/Cargo.lock +RUST/dgemm-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index 9904e005b..cc3fa2d06 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -13,21 +13,24 @@ RCFLAGS += --release .PHONY: all clean all: - cd nstream && cargo build $(RCFLAGS) - cd nstream-unsafe && cargo build $(RCFLAGS) - cd nstream-iter && cargo build $(RCFLAGS) - cd nstream-rayon && cargo build $(RCFLAGS) - cd p2p && cargo build $(RCFLAGS) - cd stencil && cargo build $(RCFLAGS) - cd transpose && cargo build $(RCFLAGS) - cd dgemm && cargo build $(RCFLAGS) - + cd nstream && cargo build $(RCFLAGS) + cd nstream-unsafe && cargo build $(RCFLAGS) + cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) + cd p2p && cargo build $(RCFLAGS) + cd stencil && cargo build $(RCFLAGS) + cd transpose && cargo build $(RCFLAGS) + cd dgemm && cargo build $(RCFLAGS) + cd dgemm-iter && cargo build $(RCFLAGS) + cd dgemm-rayon && cargo build $(RCFLAGS) clean: - cd nstream && cargo clean - cd nstream-unsafe && cargo clean - cd nstream-iter && cargo clean - cd p2p && cargo clean - cd stencil && cargo clean - cd transpose && cargo clean - cd dgemm && cargo clean - + cd nstream && cargo clean + cd nstream-unsafe && cargo clean + cd nstream-iter && cargo clean + cd nstream-rayon && cargo clean + cd p2p && cargo clean + cd stencil && cargo clean + cd transpose && cargo clean + cd dgemm && cargo clean + cd dgemm-iter && cargo clean + cd dgemm-rayon && cargo clean diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml new file mode 100644 index 000000000..5714a1fa3 --- /dev/null +++ b/RUST/dgemm-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/dgemm-iter/src/main.rs b/RUST/dgemm-iter/src/main.rs new file mode 100644 index 000000000..208cc47b5 --- /dev/null +++ b/RUST/dgemm-iter/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided 
that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
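+//          Rust iterator version by Sajid Ali, November 2022.
+//
+// The iterator formulation below keeps the i-k-j loop order: rows of C
+// are zipped with rows of A, and every element of row i of A is combined
+// with the corresponding row of B, accumulating into row i of C.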
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // https://www.reidatcheson.com/matrix%20multiplication/rust/iterators/2021/02/26/gemm-iterators.html + c.chunks_exact_mut(order) + .zip(a.chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } 
+} diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml new file mode 100644 index 000000000..49886cd96 --- /dev/null +++ b/RUST/dgemm-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/dgemm-rayon/src/main.rs b/RUST/dgemm-rayon/src/main.rs new file mode 100644 index 000000000..30dc55057 --- /dev/null +++ b/RUST/dgemm-rayon/src/main.rs @@ -0,0 +1,204 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
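+//          Rust rayon version by Sajid Ali, November 2022.
+//
+// This variant keeps the iterator-based i-k-j structure of dgemm-iter and
+// parallelizes only the outermost loop over the rows of C with rayon's
+// par_chunks_exact_mut; the inner iterators remain sequential.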
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +use rayon::prelude::*; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Outermost loop parallelism applied to dgemm-iter version + c.par_chunks_exact_mut(order) + .zip(a.par_chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff 
--git a/RUST/dgemm/Cargo.toml b/RUST/dgemm/Cargo.toml index 4548f4f12..cd045832a 100644 --- a/RUST/dgemm/Cargo.toml +++ b/RUST/dgemm/Cargo.toml @@ -1,9 +1,6 @@ [package] name = "dgemm" version = "0.1.0" -authors = ["Jeff Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] -blas = "0.20" -cblas = "0.2" -blas-src = { version = "0.7", features = ["blis"] } +edition="2021" diff --git a/RUST/dgemm/src/main.rs b/RUST/dgemm/src/main.rs index b0d03aaa4..930d3a60d 100644 --- a/RUST/dgemm/src/main.rs +++ b/RUST/dgemm/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -52,136 +53,130 @@ // /////////////////////////////////////////////// -extern crate blas; -extern crate cblas; -extern crate blas_src; - use std::env; -use std::time::{Instant,Duration}; - -//use blas::*; -use cblas::*; - -fn prk_dgemm(order : usize, a : &mut Vec, b : &mut Vec, c : &mut Vec) -{ - for i in 0..order { - for k in 0..order { - for j in 0..order { - c[i*order+j] += a[i*order+k] * b[k*order+j]; - } - } - } -} +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> "); + println!("Usage: <# iterations> "); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Dense matrix-matrix multiplication: C += A x B"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - let mut c : Vec = vec![0.0; nelems]; + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; - for i in 0..order { - for j in 0..order { - a[i*order+j] = i as 
f64; - b[i*order+j] = i as f64; + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } } - } - - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); - for k in 0..iterations+1 { - - if k == 1 { t0 = timer.elapsed(); } + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + for i in 0..order { + for k in 0..order { + for j in 0..order { + c[i * order + j] += a[i * order + k] * b[k * order + j]; + } + } + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } - //prk_dgemm(order, &mut a, &mut b, &mut c); - let m : i32 = order as i32; - let n : i32 = order as i32; - let k : i32 = order as i32; - unsafe { - dgemm(Layout::RowMajor, Transpose::None, Transpose::None, - m, n, k, 1.0, &a, m, &b, k, 1.0, &mut c, m); + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let dgemm_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let forder : f64 = order as f64; - let reference : f64 = 0.25 * (forder*forder*forder) * (forder-1.0)*(forder-1.0) * (iterations as f64 + 1.0); - let mut checksum : f64 = 0.0; - for i in 0..order { - for j in 0..order { - checksum += c[i*order+j]; + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", checksum); - } - - let epsilon : f64 = 1.0e-8; - let residuum : f64 = (checksum - reference)/reference; - if residuum < epsilon { - println!("Solution validates"); - let avgtime : f64 = (dgemm_time as f64) / (iterations as f64); - let uorder : usize = order as usize; - let nflops : usize = 2_usize * uorder * uorder * uorder; - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (nflops as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", residuum, epsilon); - return; - } } - - diff --git a/RUST/transpose/Cargo.toml b/RUST/transpose/Cargo.toml index 3f634d3c5..22fe9074e 100644 --- a/RUST/transpose/Cargo.toml +++ b/RUST/transpose/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "transpose" version = "0.1.0" -authors = ["Jeff 
Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] +edition = "2021" diff --git a/RUST/transpose/src/main.rs b/RUST/transpose/src/main.rs index 935addae8..baace9c90 100644 --- a/RUST/transpose/src/main.rs +++ b/RUST/transpose/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -54,137 +55,191 @@ use std::env; use std::mem; -use std::time::{Instant,Duration}; +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> [tile size]"); + println!("Usage: <# iterations> [tile size]"); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Matrix transpose: B = A^T"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - let tilesize : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = 32; - }, - 4 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = match args[3].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } - if tilesize > order { - println!("ERROR: tilesize cannot be > order"); - } - - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); - if tilesize < order { - println!("Tile size = {}", tilesize); - } else { - println!("Untiled"); - } - - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// - - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - - for i in 0..order { - for j in 0..order { - a[i*order+j] = (i*order+j) as f64; + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); } - } - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize 
< order { + println!("Tile size = {}", tilesize); + } else { + println!("Untiled"); + } - for k in 0..iterations+1 { + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// - if k == 1 { t0 = timer.elapsed(); } + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + // Initialize matrices for i in 0..order { - for j in 0..order { - b[j*order+i] += a[i*order+j]; - a[i*order+j] += 1.0; - } + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let transpose_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2; - let mut abserr : f64 = 0.0; - for i in 0..order { - for j in 0..order { - let ij = i*order+j; - let ji = j*order+i; - let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64; - abserr += (b[ji] - reference).abs(); + let (num_tiles, boundscheck): (usize, bool) = if order % tilesize == 0 { + (order / tilesize, false) // all tiles have same size + } else { + (order / tilesize + 1, true) // last tile has size < tilesize + }; + + println!("Initialization done, running algorithm"); + if boundscheck { + println!("Warning: Matrix order not divisible by tilesize, will employ bounds checking!") } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", abserr); - } - - let epsilon : f64 = 1.0e-8; - if abserr < epsilon { - println!("Solution validates"); - let avgtime : f64 = (transpose_time as f64) / (iterations as f64); - let bytes : usize = 2_usize * nelems * mem::size_of::(); - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (bytes as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", abserr, epsilon); - return; - } -} + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Version with no bounds check + if !boundscheck { + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } else { + // Version with bounds check + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + if rowidx < order && colidx < order { + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } + } + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + 
for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From e5009d17778e35c547326fa689ff144948bb0fcc Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Mon, 7 Nov 2022 07:35:05 -0600 Subject: [PATCH 59/80] RUST: transpose with iter! modified: .gitignore new file: RUST/transpose-iter/Cargo.toml new file: RUST/transpose-iter/src/main.rs --- .gitignore | 2 + RUST/transpose-iter/Cargo.toml | 6 + RUST/transpose-iter/src/main.rs | 258 ++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+) create mode 100644 RUST/transpose-iter/Cargo.toml create mode 100644 RUST/transpose-iter/src/main.rs diff --git a/.gitignore b/.gitignore index 9ba4c2b06..eef173b49 100644 --- a/.gitignore +++ b/.gitignore @@ -391,6 +391,8 @@ RUST/stencil/Cargo.lock RUST/stencil/target/ RUST/transpose/Cargo.lock RUST/transpose/target/ +RUST/transpose-iter/Cargo.lock +RUST/transpose-iter/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml new file mode 100644 index 000000000..22fe9074e --- /dev/null +++ b/RUST/transpose-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs new file mode 100644 index 000000000..f50e7dd27 --- /dev/null +++ b/RUST/transpose-iter/src/main.rs @@ -0,0 +1,258 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + /* + (0..num_tiles).for_each(|row_tile_idx| { + 
(0..num_tiles).for_each(|col_tile_idx| { + (0..tilesize).for_each(|row_within_tile| { + (0..tilesize).for_each(|col_within_tile| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + b[rowidx * order + colidx] += a[colidx * order + rowidx]; + }) + }) + }) + }); + */ + + b.chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterator over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterator over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 1c30af0a05e9738a1733f1c2cef51067495a3f6d Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 09:47:15 -0600 Subject: [PATCH 60/80] RUST: add transpose-rayon modified: .gitignore modified: RUST/Makefile modified: RUST/transpose-iter/src/main.rs new file: RUST/transpose-rayon/Cargo.toml new file: RUST/transpose-rayon/src/main.rs --- .gitignore | 2 + RUST/Makefile | 4 + RUST/transpose-iter/src/main.rs | 19 +-- RUST/transpose-rayon/Cargo.toml | 9 ++ RUST/transpose-rayon/src/main.rs | 247 +++++++++++++++++++++++++++++++ 5 files changed, 265 insertions(+), 16 deletions(-) create mode 100644 RUST/transpose-rayon/Cargo.toml create mode 100644 RUST/transpose-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index eef173b49..1bacbfade 100644 --- a/.gitignore +++ b/.gitignore @@ -393,6 +393,8 @@ RUST/transpose/Cargo.lock RUST/transpose/target/ 
RUST/transpose-iter/Cargo.lock RUST/transpose-iter/target/ +RUST/transpose-rayon/Cargo.lock +RUST/transpose-rayon/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/Makefile b/RUST/Makefile index cc3fa2d06..f72474c64 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -20,6 +20,8 @@ all: cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) + cd transpose-iter && cargo build $(RCFLAGS) + cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) @@ -31,6 +33,8 @@ clean: cd p2p && cargo clean cd stencil && cargo clean cd transpose && cargo clean + cd transpose-iter && cargo clean + cd transpose-rayon && cargo clean cd dgemm && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs index f50e7dd27..4aae33be1 100644 --- a/RUST/transpose-iter/src/main.rs +++ b/RUST/transpose-iter/src/main.rs @@ -166,35 +166,22 @@ fn main() { t0 = timer.elapsed(); } - /* - (0..num_tiles).for_each(|row_tile_idx| { - (0..num_tiles).for_each(|col_tile_idx| { - (0..tilesize).for_each(|row_within_tile| { - (0..tilesize).for_each(|col_within_tile| { - let rowidx: usize = row_tile_idx * tilesize + row_within_tile; - let colidx: usize = col_tile_idx * tilesize + col_within_tile; - b[rowidx * order + colidx] += a[colidx * order + rowidx]; - }) - }) - }) - }); - */ - b.chunks_exact_mut(tilesize * order) .enumerate() // for the current set of row tiles // and the rows corresponding to this row tile .for_each(|(row_tile_idx, b_rows)| { - // iterator over all column tiles + // iterate over all column tiles (0..num_tiles).for_each(|col_tile_idx| { // within the tile, iterate over *tilesize* rows of b // zipped together with rows of b available in the tile (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b |(row_within_tile, bi)| { let bi_subset_cols = bi .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) .unwrap(); - // within the tile, iterator over *tilesize* columns of b + // within the tile, iterate over *tilesize* columns of b // zipped together with subset of columns of b (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( |(col_within_tile, b_element)| { diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml new file mode 100644 index 000000000..fa75e1f79 --- /dev/null +++ b/RUST/transpose-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/transpose-rayon/src/main.rs b/RUST/transpose-rayon/src/main.rs new file mode 100644 index 000000000..8cfced9c0 --- /dev/null +++ b/RUST/transpose-rayon/src/main.rs @@ -0,0 +1,247 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use rayon::prelude::*; +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let 
nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // parallelisze outermost loop with rayon + b.par_chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterate over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterate over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.par_iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 3036da41ec0e783595c0b54d72d62b72321a8163 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 17:07:50 -0600 Subject: [PATCH 61/80] Update nstream-kokkos for kokkos-3.7 compatibility modified: Cxx11/nstream-kokkos.cc --- Cxx11/nstream-kokkos.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index 6ec3528de..340ce1819 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -163,7 +163,7 @@ int main(int argc, char * argv[]) double asum(0); Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { - using Kokkos::Experimental::fabs; + using Kokkos::fabs; inner += fabs(A(i)); }, asum); 
Kokkos::fence(); From a7687472c15942630b42fca50a68f558afc79f4a Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Wed, 9 Nov 2022 10:22:31 -0600 Subject: [PATCH 62/80] RUST: clarify naming, and add old blis based dgemm as a separate kernel modified: .gitignore new file: RUST/dgemm-blis/Cargo.toml new file: RUST/dgemm-blis/src/main.rs modified: RUST/dgemm-iter/Cargo.toml modified: RUST/dgemm-rayon/Cargo.toml modified: RUST/nstream-iter/Cargo.toml modified: RUST/nstream-rayon/Cargo.toml modified: RUST/transpose-iter/Cargo.toml modified: RUST/transpose-rayon/Cargo.toml --- .gitignore | 2 + RUST/dgemm-blis/Cargo.toml | 10 ++ RUST/dgemm-blis/src/main.rs | 202 ++++++++++++++++++++++++++++++++ RUST/dgemm-iter/Cargo.toml | 2 +- RUST/dgemm-rayon/Cargo.toml | 2 +- RUST/nstream-iter/Cargo.toml | 2 +- RUST/nstream-rayon/Cargo.toml | 2 +- RUST/transpose-iter/Cargo.toml | 2 +- RUST/transpose-rayon/Cargo.toml | 2 +- 9 files changed, 220 insertions(+), 6 deletions(-) create mode 100644 RUST/dgemm-blis/Cargo.toml create mode 100644 RUST/dgemm-blis/src/main.rs diff --git a/.gitignore b/.gitignore index 1bacbfade..73e16e2da 100644 --- a/.gitignore +++ b/.gitignore @@ -381,6 +381,8 @@ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ RUST/dgemm/Cargo.lock RUST/dgemm/target/ +RUST/dgemm-blis/Cargo.lock +RUST/dgemm-blis/target/ RUST/dgemm-iter/Cargo.lock RUST/dgemm-iter/target/ RUST/dgemm-rayon/Cargo.lock diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml new file mode 100644 index 000000000..3ea994400 --- /dev/null +++ b/RUST/dgemm-blis/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "dgemm-blis" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition="2021" + +[dependencies] +cblas = "0.4" +blas-src = { version = "0.8", features = ["blis"] } diff --git a/RUST/dgemm-blis/src/main.rs b/RUST/dgemm-blis/src/main.rs new file mode 100644 index 000000000..83ff6d041 --- /dev/null +++ b/RUST/dgemm-blis/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +// Need the following to prevent linker errors per +// https://github.com/blas-lapack-rs/blas-lapack-rs.github.io/wiki +extern crate blas_src; + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + //prk_dgemm(order, &mut a, &mut b, &mut c); + let m: i32 = order as i32; + let n: i32 = order as i32; + let k: i32 = order as i32; + unsafe { + cblas::dgemm( + cblas::Layout::RowMajor, + cblas::Transpose::None, + cblas::Transpose::None, + m, + n, + k, + 1.0, + &a, + m, + &b, + k, + 1.0, + &mut c, + m, + ); + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + 
/////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml index 5714a1fa3..af296857c 100644 --- a/RUST/dgemm-iter/Cargo.toml +++ b/RUST/dgemm-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml index 49886cd96..905e888df 100644 --- a/RUST/dgemm-rayon/Cargo.toml +++ b/RUST/dgemm-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/nstream-iter/Cargo.toml b/RUST/nstream-iter/Cargo.toml index 479e87e60..b43f54b10 100644 --- a/RUST/nstream-iter/Cargo.toml +++ b/RUST/nstream-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml index 054caa930..af291bdbf 100644 --- a/RUST/nstream-rayon/Cargo.toml +++ b/RUST/nstream-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml index 22fe9074e..840edb129 100644 --- a/RUST/transpose-iter/Cargo.toml +++ b/RUST/transpose-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml index fa75e1f79..540969f59 100644 --- a/RUST/transpose-rayon/Cargo.toml +++ b/RUST/transpose-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] From 929548b9fb5508e56f5e63588f5b2845089b808a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 17 Nov 2022 13:24:42 +0200 Subject: [PATCH 63/80] GCC OpenACC does not support runtime tilesizes --- C1z/transpose-openacc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 0ffc76c8e..8bd66d14d 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -90,7 +90,11 @@ int main(int argc, char * argv[]) printf("Number of iterations = %d\n", iterations); printf("Matrix order = %d\n", order); +#ifdef __GNUC__ + printf("Tile size = %s\n", "automatic (GCC)"); +#else printf("Tile size = %d\n", tile_size); +#endif 
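+  /* With GCC the tile size is chosen by the compiler: GCC's OpenACC
+     implementation does not support runtime tile sizes in the tile clause,
+     so the transpose loop below uses tile(*,*) and the requested tile_size
+     is only parsed, not used. */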
////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix @@ -115,7 +119,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk_wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:25:57 +0200 Subject: [PATCH 64/80] GCC OpenACC does not support runtime tilesizes --- Cxx11/transpose-openacc.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index 130d424d3..c93e0414c 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -96,7 +96,11 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; +#ifdef __GNUC__ + std::cout << "Tile size = " << "automatic (GCC)" << std::endl; +#else std::cout << "Tile size = " << tile_size << std::endl; +#endif ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -121,7 +125,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk::wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:47:34 +0200 Subject: [PATCH 65/80] fix restrict->RESTRICT --- Cxx11/transpose-openacc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index c93e0414c..258064534 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -109,8 +109,8 @@ int main(int argc, char * argv[]) double trans_time{0}; size_t bytes = order*order*sizeof(double); - double * restrict A = (double *)acc_malloc(bytes); - double * restrict B = (double *)acc_malloc(bytes); + double * RESTRICT A = (double *)acc_malloc(bytes); + double * RESTRICT B = (double *)acc_malloc(bytes); { #pragma acc parallel loop deviceptr(A,B) From 793667cbd2eb137c73b1675894cd7ad1f21ce2f5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 7 Nov 2022 08:44:06 +0200 Subject: [PATCH 66/80] add SGEMM CBLAS --- Cxx11/Makefile | 2 +- Cxx11/sgemm-cblas.cc | 340 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 Cxx11/sgemm-cblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3a50f690a..ee69f1a75 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -133,7 +133,7 @@ thrust: nstream-host-thrust nstream-device-thrust \ cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas -cblas: transpose-cblas dgemm-cblas +cblas: transpose-cblas dgemm-cblas sgemm-cblas onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl diff --git a/Cxx11/sgemm-cblas.cc b/Cxx11/sgemm-cblas.cc new file mode 100644 index 000000000..625ce693f --- /dev/null +++ b/Cxx11/sgemm-cblas.cc @@ -0,0 +1,340 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: sgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblas_sgemm() +/// cblas_sgemm_batch() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#if defined(MKL) +#include +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. 
+#include +#else +#include +#endif + +#ifdef _OPENMP +#include +#endif + +#ifdef PRK_DEBUG +#include +void prk_sgemm_loops(const int order, + const std::vector & A, + const std::vector & B, + std::vector & C) +{ + for (int i=0; i & A, + const std::vector & B, + std::vector & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n); +} + +void prk_sgemm(const int order, const int batches, + const std::vector> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + for (int b=0; b> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic) num_threads(nt) +#endif + for (int b=0; b [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + batch_threads = std::atoi(argv[4]); + } else { +#ifdef _OPENMP + batch_threads = omp_get_max_threads(); +#endif + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches > 0) { +#ifdef MKL + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; +#else + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; +#endif + } else if (batches < 0) { + if (batch_threads > 1) { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl; + } else { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double gemm_time(0); + + const int matrices = (batches==0 ? 1 : abs(batches)); + + std::vector const M(order*order,0); + std::vector> A(matrices,M); + std::vector> B(matrices,M); + std::vector> C(matrices,M); + for (int b=0; b 0) { + prk_sgemm(order, matrices, pA, pB, pC); + } + } + gemm_time = prk::wtime() - gemm_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + for (int b=0; b Date: Mon, 23 Jan 2023 14:58:18 +0200 Subject: [PATCH 67/80] add shmem4py (#618) * add shmem4py example * better install directions * shmem alltoall behaves different than mpi alltoall, so we have to add a barrier. 
* abort does not flush print Signed-off-by: Jeff Hammond Co-authored-by: Lisandro Dalcin --- PYTHON/README.md | 22 ++++ PYTHON/nstream-numpy-shmem.py | 175 ++++++++++++++++++++++++ PYTHON/stencil-numpy-mpi.py | 0 PYTHON/transpose-numpy-mpi-rma.py | 11 +- PYTHON/transpose-numpy-mpi.py | 5 +- PYTHON/transpose-numpy-shmem.py | 212 ++++++++++++++++++++++++++++++ 6 files changed, 415 insertions(+), 10 deletions(-) create mode 100755 PYTHON/nstream-numpy-shmem.py mode change 100644 => 100755 PYTHON/stencil-numpy-mpi.py create mode 100755 PYTHON/transpose-numpy-shmem.py diff --git a/PYTHON/README.md b/PYTHON/README.md index 7f670436f..9c624b775 100644 --- a/PYTHON/README.md +++ b/PYTHON/README.md @@ -1,5 +1,7 @@ # How to run +## mpi4py + ``` mpiexec -n 4 python3 -m mpi4py nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 python3 -m mpi4py transpose-numpy-mpi.py 10 1000 @@ -11,3 +13,23 @@ On Mac with Homebrew, this might work better: mpiexec -n 4 ./nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 ./transpose-numpy-mpi.py 10 1000 ``` + +## shmem4py + +Checkout shmem4py and build against e.g. SOS like this: +``` +$ export OSHCC=oshcc +$ python3 -m pip install . +``` + +Run like this: +``` +$ oshrun -n 4 python3 nstream-numpy-shmem.py 10 10000000 +Parallel Research Kernels version +Python SHMEM/NumPy STREAM triad: A = B + scalar * C +Number of ranks = 4 +Number of iterations = 10 +Vector length = 10000000 +Solution validates +Rate (MB/s): 22345.12038433607 Avg time (s): 0.0143208 +``` diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py new file mode 100755 index 000000000..3b42f0488 --- /dev/null +++ b/PYTHON/nstream-numpy-shmem.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# +# ******************************************************************* + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: python nstream.py <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + total_length = int(sys.argv[2]) + if total_length < 1: + sys.exit("ERROR: length must be positive") + + length = int(total_length / np) + remainder = total_length % np + if (remainder > 0): + if (me < remainder): + length += 1 + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Vector length = ', total_length) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # 0.0 is a float, which is 64b (53b of precision) + A = numpy.zeros(length) + B = numpy.full(length,2.0) + C = numpy.full(length,2.0) + + scalar = 3.0 + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + A += B + scalar * C + + + shmem.barrier_all() + t1 = timer() + nstream_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + ar = 0.0 + br = 2.0 + cr = 2.0 + ref = 0.0 + for k in range(0,iterations+1): + ar += br + scalar * cr + + ar *= total_length + + #asum = numpy.linalg.norm(A, ord=1) + #shmem.reduce(asum) + + asum = numpy.linalg.norm(A, ord=1) + src = shmem.full(1, asum) + tgt = shmem.full(1, 0.0) + shmem.reduce(tgt,src) + asum = tgt + + epsilon=1.e-8 + if abs(ar-asum)/asum > epsilon: + if (me==0): + print('Failed Validation on output array'); + print(' Expected checksum: ',ar); + print(' Observed checksum: ',asum); + print("ERROR: solution did not validate") + else: + if (me==0): + print('Solution validates') + avgtime = nstream_time/iterations + nbytes = 4.0 * total_length * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + + +if __name__ == '__main__': + main() diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py index efa3ca359..b064596ee 100755 --- a/PYTHON/transpose-numpy-mpi-rma.py +++ b/PYTHON/transpose-numpy-mpi-rma.py @@ -159,8 +159,10 @@ def main(): for phase in range(0,np): recv_from = (me + phase) % np bsize = block_order * block_order - WA.Get(T, recv_from, [bsize * recv_from, bsize, MPI.DOUBLE]) - WA.Flush_all() + #WA.Get(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + #WA.Flush(recv_from) + r = WA.Rget(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + r.Wait() lo = block_order * recv_from hi = block_order * (recv_from+1) @@ -200,10 +202,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index 5dacbd5ea..d0413f52f 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -190,10 +190,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py new file mode 100755 index 000000000..1495dec53 --- /dev/null +++ b/PYTHON/transpose-numpy-shmem.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. +# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. 
+# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + if order % np != 0: + sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + A = shmem.full((order,block_order),LA) + B = shmem.zeros((order,block_order)) + T = shmem.zeros((order,block_order)) + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + # this actually forms the transpose of A + #B += numpy.transpose(A) + # this only uses the transpose _view_ of A + #B += A.T + + # barrier required before alltoall for correctness + shmem.barrier_all() + shmem.alltoall(T, A) + for r in range(0,np): + lo = block_order * r + hi = block_order * (r+1) + #B[lo:hi,:] += numpy.transpose(T[lo:hi,:]) + B[lo:hi,:] += T[lo:hi,:].T + + A += 1.0 + + shmem.barrier_all() + t1 = timer() + trans_time = t1 - t0 + + shmem.free(A) + shmem.free(T) + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = shmem.zeros((np,order,block_order)) + shmem.fcollect(F,B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + shmem.free(B) + shmem.free(F) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. 
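+    # note (added comment): abserr above is the L1 norm of G - H, i.e. the gathered transpose versus the analytic reference built with fromfunction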
+ if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From e39cd7ced7e3118352b88ab18c834111e95b3004 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Jan 2023 14:05:49 +0200 Subject: [PATCH 68/80] rename (#619) --- FORTRAN/Makefile | 2 +- FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} | 0 FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} (100%) rename FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} (100%) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 6d3b0c1f1..625490385 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -141,7 +141,7 @@ dgemm-blas: dgemm-blas.F90 prk.mod %-openacc: %-openacc.F90 prk.mod $(FC) $(FCFLAGS) $(OPENACCFLAG) $< prk_mod.o -o $@ -%-cufortran: %-cufortran.cuf prk.mod +%-cufortran: %-cufortran.F90 prk.mod $(FC) $(FCFLAGS) $(CUFORTFLAG) $< prk_mod.o -o $@ %-stdpar: %-stdpar.F90 prk.mod diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.F90 similarity index 100% rename from FORTRAN/nstream-cufortran.cuf rename to FORTRAN/nstream-cufortran.F90 diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.F90 similarity index 100% rename from FORTRAN/transpose-cufortran.cuf rename to FORTRAN/transpose-cufortran.F90 From a8c9d697317b21860039a72d6937968223ad81d7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:03:46 +0200 Subject: [PATCH 69/80] fix name --- RUST/nstream-unsafe/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RUST/nstream-unsafe/Cargo.toml b/RUST/nstream-unsafe/Cargo.toml index 479e87e60..81a229d01 100644 --- a/RUST/nstream-unsafe/Cargo.toml +++ b/RUST/nstream-unsafe/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-unsafe" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] From 02937f26557fd35738016ff8bc788e8da26a2848 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:27:15 +0200 Subject: [PATCH 70/80] add dgemm-blis --- RUST/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RUST/Makefile b/RUST/Makefile index f72474c64..3516f35f1 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -23,6 +23,7 @@ all: cd transpose-iter && cargo build $(RCFLAGS) cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) + cd dgemm-blis && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) clean: @@ -36,5 +37,6 @@ clean: cd transpose-iter && cargo clean cd transpose-rayon && cargo clean cd dgemm && cargo clean + cd dgemm-blis && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean From 98ad8948ad85599a836160a3aa6c789dba3618c9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 Mar 2023 09:55:48 +0200 Subject: [PATCH 71/80] dunno --- RUST/dgemm-blis/Cargo.toml | 2 +- common/make.defs.gcc | 55 +++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml index 3ea994400..249b6fb8c 100644 --- a/RUST/dgemm-blis/Cargo.toml +++ b/RUST/dgemm-blis/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 
"dgemm-blis" -version = "0.1.0" +version = "0.5.0" authors = ["Jeff Hammond ", "Sajid Ali "] edition="2021" diff --git a/common/make.defs.gcc b/common/make.defs.gcc index afcf1a6ae..62e540298 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-11 +VERSION=-10 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -43,15 +43,15 @@ OPENACCFLAG=-fopenacc # OpenCL flags # # MacOS -OPENCLFLAG=-framework OpenCL +#OPENCLFLAG=-framework OpenCL # POCL # http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux -#OPENCLDIR=/etc/alternatives/opencl-intel-tools -#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLDIR=/etc/alternatives/opencl-intel-tools +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders +#METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # @@ -98,19 +98,17 @@ METALFLAG=-framework MetalPerformanceShaders # # hipSYCL # -SYCLDIR=/opt/hipSYCL -SYCLCXX=${SYCLDIR}/bin/syclcc-clang -SYCLFLAG=-std=c++17 -O3 -SYCLFLAG+=-DHIPSYCL +#SYCLDIR=/opt/hipSYCL +#SYCLCXX=${SYCLDIR}/bin/syclcc-clang +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-DHIPSYCL # CPU platform -SYCLFLAG+=--hipsycl-platform=cpu -SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib +#SYCLFLAG+=--hipsycl-platform=cpu +#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib # -CELERITYDIR=${SYCLDIR} -CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor -CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime -MPIINC=-I/usr/include/mpich-3.2-x86_64 -MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +#CELERITYDIR=${SYCLDIR} +#CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor +#CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # # OCCA # @@ -162,19 +160,19 @@ UPCXXFLAG+=-mtune=native -ffast-math # #BLASFLAG=-L${HOME}/BLIS/lib -lblis #-fopenmp -lpthread #CBLASFLAG=-I${HOME}/BLIS/include -BLASFLAG=-DACCELERATE -framework Accelerate -CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions -#BLASFLAG=-lblas -#`CBLASFLAG=-lblas +#BLASFLAG=-DACCELERATE -framework Accelerate +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASFLAG=-lblas +CBLASFLAG=-lblas # # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -NVCC=nvcc +NVCC=/usr/local/cuda-11.4/bin/nvcc CUDAFLAGS=-g -O3 -std=c++11 -CUDAFLAGS+=-arch=sm_50 +CUDAFLAGS+=-arch=sm_87 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 #CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # @@ -205,10 +203,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 -MPICC=${MPIDIR}/bin/mpicc -MPICXX=${MPIDIR}/bin/mpicxx -MPIFORT=${MPIDIR}/bin/mpifort +MPIDIR=/usr +MPICC=${MPIDIR}/bin/mpicc.mpich +MPICXX=${MPIDIR}/bin/mpicxx.mpich +MPIFORT=${MPIDIR}/bin/mpifort.mpich MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi @@ -241,7 +239,8 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib # single-node #COARRAYFLAG=-fcoarray=single -lcaf_single # multi-node -COARRAYFLAG=-fcoarray=lib 
-L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +COARRAYFLAG=-fcoarray=lib -L/usr/lib/x86_64-linux-gnu/open-coarrays/mpich/lib -lcaf_mpi # # MEMKIND (used in C1z) # From 53c10ce02dbd3fd6d4bb2fd1028bd5c8987ff7cc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Mar 2023 15:33:58 +0300 Subject: [PATCH 72/80] gcc apple update for ventura (#623) --- common/make.defs.gcc | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 62e540298..2f52fa0c6 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-10 +VERSION=-12 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -186,16 +186,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 # -# Halide -# -HALIDECXX=${CXX} -HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -# # ISPC # ISPC=ispc @@ -203,10 +193,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/usr -MPICC=${MPIDIR}/bin/mpicc.mpich -MPICXX=${MPIDIR}/bin/mpicxx.mpich -MPIFORT=${MPIDIR}/bin/mpifort.mpich +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.5 +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi From f3a392e609078f3a23715c609ab5069e1a4fe961 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:13:26 +0300 Subject: [PATCH 73/80] brew tbb update --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 2f52fa0c6..1d6dd89e8 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -121,7 +121,7 @@ OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 +TBBDIR=/opt/homebrew/Cellar/tbb/2021.8.0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb From 7925db02a5ebe6d65a1b49348e31151b4a01668a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:23:24 +0300 Subject: [PATCH 74/80] add flang-new docs --- doc/flang-new.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/flang-new.md diff --git a/doc/flang-new.md b/doc/flang-new.md new file mode 100644 index 000000000..fe47c6e13 --- /dev/null +++ b/doc/flang-new.md @@ -0,0 +1,6 @@ +This works, but -flang-experimental-exec` and `-Wall` are ignored. 
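+The block below is one example sequence; the `/opt/llvm/latest` install prefix and the CommandLineTools SDK path used in the link step are machine-specific.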
+ +``` +/opt/llvm/latest/bin/flang-new -flang-experimental-exec -g -O3 -ffast-math -Wall -DRADIUS=2 -DSTAR -c p2p.F90 +ld -L /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib -lSystem p2p.o prk_mod.o -o p2p /opt/llvm/latest/lib/libFortran*a +``` From 252bbb5e047cd4e95e550597b7afe6c54170abf8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 19 Jul 2023 11:04:29 +0300 Subject: [PATCH 75/80] fix petsc transpose - closes #615 (#626) --- C1z/nstream-petsc.c | 2 +- C1z/transpose-petsc.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/C1z/nstream-petsc.c b/C1z/nstream-petsc.c index 93b931872..aec86ff5c 100644 --- a/C1z/nstream-petsc.c +++ b/C1z/nstream-petsc.c @@ -119,7 +119,7 @@ int main(int argc, char * argv[]) #endif PetscPrintf(PETSC_COMM_WORLD,"Number of processes = %d\n", np); PetscPrintf(PETSC_COMM_WORLD,"Number of iterations = %d\n", iterations); - PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", length); + PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", (size_t)length); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/C1z/transpose-petsc.c b/C1z/transpose-petsc.c index ed631cf88..aa3219c68 100644 --- a/C1z/transpose-petsc.c +++ b/C1z/transpose-petsc.c @@ -118,10 +118,7 @@ int main(int argc, char * argv[]) double trans_time = 0.0; - PetscReal zero = 0; PetscReal one = 1; - PetscReal two = 2; - PetscReal three = 3; Mat A; Mat B; @@ -144,6 +141,8 @@ int main(int argc, char * argv[]) } } ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + // https://petsc.org/main/manualpages/Mat/MatTransposeSetPrecursor/ + ierr = MatTransposeSetPrecursor(A, AT); CHKERRQ(ierr); // B[i,j] = 0 #if 0 @@ -196,9 +195,9 @@ int main(int argc, char * argv[]) // Analyze and output results ////////////////////////////////////////////////////////////////////// - PetscReal addit = (iterations+1)*(iterations)/2; PetscReal abserr = 0; #if 0 + PetscReal addit = (iterations+1)*(iterations)/2; for (int j=0; j Date: Wed, 19 Jul 2023 14:47:55 -0400 Subject: [PATCH 76/80] Update Intel SYCL compiler driver. Update device selectors and accessors to SYCL2020. (#629) Signed-off-by: James Brodman --- Cxx11/dgemm-onemkl.cc | 2 +- Cxx11/dgemm-sycl.cc | 8 +++--- Cxx11/generate-sycl-stencil.py | 4 +-- Cxx11/nstream-dpcpp.cc | 2 +- Cxx11/nstream-onedpl.cc | 2 +- Cxx11/nstream-onemkl.cc | 2 +- Cxx11/nstream-sycl-explicit-usm.cc | 23 ++--------------- Cxx11/nstream-sycl-explicit.cc | 38 +++++++--------------------- Cxx11/nstream-sycl-usm.cc | 23 ++--------------- Cxx11/nstream-sycl.cc | 30 ++++------------------ Cxx11/p2p-hyperplane-sycl.cc | 4 +-- Cxx11/pic-sycl.cc | 15 +++++------ Cxx11/prk_sycl.h | 2 -- Cxx11/stencil-2d-sycl.cc | 27 +++----------------- Cxx11/stencil-sycl-usm.cc | 23 ++--------------- Cxx11/stencil-sycl.cc | 27 +++----------------- Cxx11/stencil_sycl.hpp | 40 +++++++++++++++--------------- Cxx11/transpose-2d-sycl.cc | 27 +++----------------- Cxx11/transpose-dpcpp.cc | 2 +- Cxx11/transpose-sycl-usm.cc | 23 ++--------------- Cxx11/transpose-sycl.cc | 27 +++----------------- Cxx11/xgemm-onemkl.cc | 21 ++-------------- common/make.defs.oneapi | 2 +- 23 files changed, 80 insertions(+), 294 deletions(-) diff --git a/Cxx11/dgemm-onemkl.cc b/Cxx11/dgemm-onemkl.cc index d1f9b65ec..0ebccd128 100644 --- a/Cxx11/dgemm-onemkl.cc +++ b/Cxx11/dgemm-onemkl.cc @@ -126,7 +126,7 @@ int main(int argc, char * argv[]) } std::cout << "Input copy = " << (input_copy ? 
"yes" : "no") << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/dgemm-sycl.cc b/Cxx11/dgemm-sycl.cc index a7ca3dd4f..dda801652 100644 --- a/Cxx11/dgemm-sycl.cc +++ b/Cxx11/dgemm-sycl.cc @@ -73,9 +73,9 @@ void prk_dgemm(sycl::queue & q, { q.submit([&](sycl::handler& h) { - auto A = d_A.get_access(h); - auto B = d_B.get_access(h); - auto C = d_C.get_access(h); + sycl::accessor A(d_A, h, sycl::read_only); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h); h.parallel_for( sycl::range<2>{order,order}, [=] (sycl::id<2> it) { @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) return 1; } - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); if (tile_size < order) { diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index c67f2d124..9a28bdb2e 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -26,8 +26,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('{\n') src.write(' q.submit([&](sycl::handler& h) {\n') if (not usm): - src.write(' auto in = d_in.template get_access(h);\n') - src.write(' auto out = d_out.template get_access(h);\n') + src.write(' sycl::accessor in(d_in, h, sycl::read_only);\n') + src.write(' sycl::accessor out(d_out, h);\n') if (dim==2): for r in range(1,radius+1): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') diff --git a/Cxx11/nstream-dpcpp.cc b/Cxx11/nstream-dpcpp.cc index efc0fcaf3..4306adc12 100644 --- a/Cxx11/nstream-dpcpp.cc +++ b/Cxx11/nstream-dpcpp.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Vector length = " << length << std::endl; std::cout << "Block size = " << block_size << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); size_t padded_length = block_size * prk::divceil(length,block_size); diff --git a/Cxx11/nstream-onedpl.cc b/Cxx11/nstream-onedpl.cc index 963683945..8cd48fc2a 100644 --- a/Cxx11/nstream-onedpl.cc +++ b/Cxx11/nstream-onedpl.cc @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/nstream-onemkl.cc b/Cxx11/nstream-onemkl.cc index 0c69f9808..55448ec74 100644 --- a/Cxx11/nstream-onemkl.cc +++ b/Cxx11/nstream-onemkl.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::default_selector_v, sycl::property::queue::in_order{}); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index aa5c5c690..cf5f9f89a 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -275,7 +275,7 @@ int main(int argc, char * argv[]) 
////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -294,26 +294,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index adf045d32..e7cf0bd57 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -100,15 +100,15 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) sycl::buffer d_C { sycl::range<1>{length} }; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::no_init); h.fill(A,(T)0); }); q.submit([&](sycl::handler& h) { - sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::no_init); h.fill(B,(T)2); }); q.submit([&](sycl::handler& h) { - sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor C(d_C, h, sycl::no_init); h.fill(C,(T)2); }); q.wait(); @@ -118,10 +118,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -164,7 +163,7 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) nstream_time = prk::wtime() - nstream_time; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::read_only); h.copy(A,h_A.data()); }); q.wait(); @@ -268,26 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -306,7 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, 
iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index e872a5130..cc2865324 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -253,7 +253,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -272,26 +272,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 140125f9d..902291315 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -106,10 +106,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -250,26 +249,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -288,7 +268,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 1e2083982..5611a84a7 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -148,7 +148,7 @@ int main(int argc, char* argv[]) q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); unsigned begin = std::max(2,i-n+2); unsigned end = std::min(i,n)+1; @@ -172,7 +172,7 @@ int main(int argc, char* argv[]) } q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); h.single_task([=] { grid[0*n+0] = -grid[(n-1)*n+(n-1)]; diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc 
index c55e5f4ff..b47572ba7 100644 --- a/Cxx11/pic-sycl.cc +++ b/Cxx11/pic-sycl.cc @@ -523,14 +523,12 @@ int main(int argc, char ** argv) { std::string devname = (devchar==NULL ? "None" : devchar); sycl::device d; if (devname == "CPU") { - d = sycl::cpu_selector{}.select_device(); + d = sycl::device{sycl::cpu_selector_v}; } else if (devname == "GPU") { - d = sycl::gpu_selector{}.select_device(); - } else if (devname == "HOST") { - d = sycl::host_selector{}.select_device(); + d = sycl::device{sycl::gpu_selector_v}; } else { - std::cout << "PRK_DEVICE should be CPU, GPU or HOST" << std::endl; - d = sycl::default_selector{}.select_device(); + std::cout << "PRK_DEVICE should be CPU or GPU" << std::endl; + d = sycl::device{sycl::default_selector_v}; } sycl::queue q(d); prk::SYCL::print_device_platform(q); @@ -603,9 +601,8 @@ int main(int argc, char ** argv) { /* Calculate forces on particles and update positions */ q.submit([&](sycl::handler& cgh) { - - auto p = d_particles.get_access(cgh); - auto q = d_Qgrid.get_access(cgh); + sycl::accessor p(d_particles, cgh); + sycl::accessor q(d_Qgrid, cgh, sycl::read_only); cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_work_size), sycl::range<1>(local_work_size)), [=] (sycl::nd_item<1> item) { auto i = item.get_global_id(0); diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index 8d37e489d..f70516d89 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -6,8 +6,6 @@ #include "CL/sycl.hpp" -namespace sycl = cl::sycl; - #if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && defined(__LIBSYCL_PATCH_VERSION) # define __LIBSYCL_VERSION \ (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + __LIBSYCL_PATCH_VERSION) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index b6eeb09bc..b945e9ad7 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -123,7 +123,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { // accessor methods - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -142,7 +142,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); // Add constant to solution to force refresh of neighbor data, if any h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -278,7 +278,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -297,26 +297,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); 
run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index b219b24f1..3f4a687fd 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -267,7 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::cpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -286,26 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::gpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 8947c8dee..f5eb3f6f5 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -121,7 +121,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star sycl::buffer d_out { h_out.data(), h_out.size() }; q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -140,7 +140,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -276,7 +276,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -295,26 +295,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 64af40b79..5339a6826 100644 --- a/Cxx11/stencil_sycl.hpp 
+++ b/Cxx11/stencil_sycl.hpp
@@ -5,8 +5,8 @@ template
 void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 1;
         const auto j = it[1] + 1;
@@ -25,8 +25,8 @@ template
 void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) {
@@ -64,8 +64,8 @@ template
 void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 2;
         const auto j = it[1] + 2;
@@ -88,8 +88,8 @@ template
 void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -137,8 +137,8 @@ template
 void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 3;
         const auto j = it[1] + 3;
@@ -165,8 +165,8 @@ template
 void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -224,8 +224,8 @@ template
 void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 4;
         const auto j = it[1] + 4;
@@ -256,8 +256,8 @@ template
 void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -325,8 +325,8 @@ template
 void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 5;
         const auto j = it[1] + 5;
@@ -361,8 +361,8 @@ template
 void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc
index 2fbe8938b..55d3b8393 100644
--- a/Cxx11/transpose-2d-sycl.cc
+++ b/Cxx11/transpose-2d-sycl.cc
@@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size)
     q.submit([&](sycl::handler& h) {
       // accessor methods
-      auto A = d_A.template get_access(h);
-      auto B = d_B.template get_access(h);
+      sycl::accessor A(d_A, h);
+      sycl::accessor B(d_B, h);
       h.parallel_for>(
 #if PREBUILD_KERNEL
@@ -214,7 +214,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -233,26 +233,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/transpose-dpcpp.cc b/Cxx11/transpose-dpcpp.cc
index efdb159e3..ccd1403e0 100644
--- a/Cxx11/transpose-dpcpp.cc
+++ b/Cxx11/transpose-dpcpp.cc
@@ -96,7 +96,7 @@ int main(int argc, char * argv[])
   std::cout << "Matrix order = " << order << std::endl;
   std::cout << "Block size = " << block_size << std::endl;

-  sycl::queue q(sycl::default_selector{});
+  sycl::queue q(sycl::default_selector_v);
   prk::SYCL::print_device_platform(q);

   size_t padded_order = block_size * prk::divceil(order,block_size);
diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc
index 1ec5c1470..249440ee0 100644
--- a/Cxx11/transpose-sycl-usm.cc
+++ b/Cxx11/transpose-sycl-usm.cc
@@ -197,7 +197,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -216,26 +216,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index da0d596c0..894a916bd 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size)
     q.submit([&](sycl::handler& h) {
       // accessor methods
-      auto A = d_A.template get_access(h);
-      auto B = d_B.template get_access(h);
+      sycl::accessor A(d_A, h);
+      sycl::accessor B(d_B, h);
       h.parallel_for>(
 #if PREBUILD_KERNEL
@@ -213,7 +213,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -232,26 +232,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc
index 68dfcb587..446777a4a 100644
--- a/Cxx11/xgemm-onemkl.cc
+++ b/Cxx11/xgemm-onemkl.cc
@@ -199,7 +199,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order);
     run(q, iterations, order);
@@ -216,24 +216,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order);
-    run(q, iterations, order);
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     bool has_fp64 = prk::SYCL::has_fp64(q);
     if (has_fp64) {
diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi
index 66b5dbd8e..fd8b7ece0 100644
--- a/common/make.defs.oneapi
+++ b/common/make.defs.oneapi
@@ -58,7 +58,7 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL
 #
 # Intel oneAPI
 #
-SYCLCXX=dpcpp
+SYCLCXX=icpx
 SYCLFLAG=-fsycl
 SYCLFLAG+=-std=c++17 -O3 -g3
 SYCLFLAG+=-DDPCPP

From 712ff1b3795670cb5c7498f2a8bc4a155f6bb707 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 20 Jul 2023 10:25:46 +0300
Subject: [PATCH 77/80] better xgemm test for onemkl (#630)

---
 Cxx11/xgemm-onemkl.cc | 81 ++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc
index 446777a4a..5dcac9384 100644
--- a/Cxx11/xgemm-onemkl.cc
+++ b/Cxx11/xgemm-onemkl.cc
@@ -1,5 +1,6 @@
 ///
 /// Copyright (c) 2020, Intel Corporation
+/// Copyright (c) 2023, NVIDIA
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -63,6 +64,7 @@
 #include 
 #else
 #include 
+#include 
 #endif

 using namespace oneapi; // oneapi::mkl -> mkl
@@ -139,7 +141,7 @@ void run(sycl::queue & q, int iterations, int order)
   }
   const double residuum = std::abs(checksum - reference) / reference;
   const double epsilon{1.0e-8};
-  if (residuum < epsilon) {
+  if ((residuum < epsilon) || (sizeof(T) < 4)) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
@@ -147,8 +149,16 @@ void run(sycl::queue & q, int iterations, int order)
     std::cout << "Solution validates" << std::endl;
     auto avgtime = gemm_time/iterations;
     auto nflops = 2.0 * prk::pow(forder,3);
-    std::cout << "FP" << 8*sizeof(T)
-              << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+    auto is_fp64 = (typeid(T) == typeid(double));
+    auto is_fp32 = (typeid(T) == typeid(float));
+    auto is_fp16 = (typeid(T) == typeid(sycl::half));
+    auto is_bf16 = (typeid(T) == typeid(oneapi::mkl::bfloat16));
+    auto pname = (is_fp64 ? "FP64" :
+                  (is_fp32 ? "FP32" :
+                   (is_fp16 ? "FP16" :
+                    (is_bf16 ? "BF16" : "Unknown FP type"))));
+    std::cout << pname
+              << " Rate (MF/s): " << 1.0e-6 * nflops/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "Reference checksum = " << reference << "\n"
@@ -198,46 +208,31 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////

-  try {
-    sycl::queue q{sycl::cpu_selector_v};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order);
-    run(q, iterations, order);
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector_v};
-    prk::SYCL::print_device_platform(q);
-    bool has_fp64 = prk::SYCL::has_fp64(q);
-    if (has_fp64) {
-      if (prk::SYCL::print_gen12lp_helper(q)) return 1;
-    }
-    run(q, iterations, order);
-    if (has_fp64) {
-      run(q, iterations, order);
-    } else {
-      std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
-    }
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
+  sycl::queue qs[2] = { sycl::queue{sycl::cpu_selector_v},
+                        sycl::queue{sycl::gpu_selector_v} };
+  for (auto q : qs) {
+    try {
+      prk::SYCL::print_device_platform(q);
+      bool has_fp64 = prk::SYCL::has_fp64(q);
+      run(q, iterations, order);
+      run(q, iterations, order);
+      run(q, iterations, order);
+      if (has_fp64) {
+        run(q, iterations, order);
+      } else {
+        std::cout << "SYCL device lacks FP64 support." << std::endl;
+      }
+    }
+    catch (sycl::exception & e) {
+      std::cout << e.what() << std::endl;
+      prk::SYCL::print_exception_details(e);
+    }
+    catch (std::exception & e) {
+      std::cout << e.what() << std::endl;
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+    }
   }

   return 0;

From 549978bc93f1a6c02715bc6874bb4b801c1b2045 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 4 May 2023 13:10:25 +0300
Subject: [PATCH 78/80] fix C ism bug

---
 Cxx11/pic-sycl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc
index b47572ba7..6d5d31503 100644
--- a/Cxx11/pic-sycl.cc
+++ b/Cxx11/pic-sycl.cc
@@ -126,7 +126,7 @@ double * initializeGrid(uint64_t L)
 }

 /* Completes particle distribution */
-void finish_distribution(const uint64_t n, particle_t p[const n])
+void finish_distribution(const uint64_t n, particle_t p[])
 {
   for (uint64_t pi=0; pi

Date: Thu, 4 May 2023 13:10:32 +0300
Subject: [PATCH 79/80] par exec needed

---
 Cxx11/nstream-stdpar.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Cxx11/nstream-stdpar.cc b/Cxx11/nstream-stdpar.cc
index 4723f0a93..0c4cd5ecf 100644
--- a/Cxx11/nstream-stdpar.cc
+++ b/Cxx11/nstream-stdpar.cc
@@ -145,7 +145,8 @@ int main(int argc, char * argv[])
       auto nstream = [=] (thrust::tuple t) {
           return thrust::get<0>(t) + thrust::get<1>(t) + scalar * thrust::get<2>(t);
       };
-      std::transform( thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+      std::transform( std::execution::par_unseq,
+                      thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
                       thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())),
                       A.begin(),
                       nstream);

From 65547411769eec741f3ce403469ea77ee3712291 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 20 Jul 2023 10:35:35 +0300
Subject: [PATCH 80/80] disable TBB and related because they keep breaking it

---
 Cxx11/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index ee69f1a75..7e8cb9ce3 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -60,13 +60,13 @@ ifdef OCCADIR
 endif
 OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca

-.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \
+.PHONY: all clean vector valarray openmp target opencl taskloop stl \
         ranges kokkos raja cuda cublas sycl dpcpp \
         boost-compute thrust executor oneapi onemkl

 EXTRA=
 ifneq ($(findstring nvc++,$(CXX)),nvc++)
-  EXTRA += ranges stl pstl
+  EXTRA += ranges stl #pstl tbb # TBB keeps breaking due to API changes
 endif
 ifneq ($(OPENACCFLAG),)
   EXTRA += openacc
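
Editor's note (not part of the patch series): the SYCL hunks above migrate the sources from SYCL 1.2.1 idioms (host_selector{}/cpu_selector{}/gpu_selector{} objects, buffer::get_access<mode>(handler)) to SYCL 2020 idioms (cpu_selector_v/gpu_selector_v and CTAD accessors with sycl::read_only). The following is a minimal standalone sketch of the new style, assuming a SYCL 2020 compiler such as icpx -fsycl; the file and variable names are illustrative only and do not appear in the repository.

// sycl2020-sketch.cc: illustrative example of the SYCL 2020 idioms adopted above.
#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>

int main() {
  const size_t n = 1024;
  std::vector<double> in(n, 1.0), out(n, 0.0);
  // SYCL 2020: selector variables replace the deprecated selector classes.
  sycl::queue q{sycl::cpu_selector_v};
  {
    sycl::buffer<double> d_in(in.data(), sycl::range<1>{n});
    sycl::buffer<double> d_out(out.data(), sycl::range<1>{n});
    q.submit([&](sycl::handler& h) {
      // SYCL 2020: CTAD accessors replace buffer::get_access<mode>(h).
      sycl::accessor a_in(d_in, h, sycl::read_only);
      sycl::accessor a_out(d_out, h); // read_write by default
      h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) {
        a_out[i] = 2.0 * a_in[i];
      });
    });
  } // buffers go out of scope here and copy results back to the host vectors
  std::cout << "out[0] = " << out[0] << std::endl;
  return 0;
}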