From f5595d379b4f9e2b768c8096d67903722a59b3e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 13:14:17 -0800 Subject: [PATCH 01/80] remove HPX-5, HPX-3 is the only HPX from now on --- travis/build-run-prk.sh | 3 --- travis/install-deps.sh | 9 ++----- travis/install-hpx.sh | 40 +++++++++++++++++++++++++++++ travis/install-hpx3.sh | 57 ----------------------------------------- travis/install-hpx5.sh | 39 ---------------------------- 5 files changed, 42 insertions(+), 106 deletions(-) create mode 100755 travis/install-hpx.sh delete mode 100755 travis/install-hpx3.sh delete mode 100755 travis/install-hpx5.sh diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index c9e98f2f0..14e505265 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -1134,9 +1134,6 @@ case "$PRK_TARGET" in allhpx3) echo "Nothing to do yet" ;; - allhpx5) - echo "Nothing to do yet" - ;; alllegion) echo "Legion" echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..3c4fc29a1 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -169,16 +169,11 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx3) - echo "HPX-3" + allhpx) + echo "HPX" sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-hpx3.sh $TRAVIS_ROOT ;; - allhpx5) - echo "HPX-5" - sh ./travis/install-autotools.sh $TRAVIS_ROOT - sh ./travis/install-hpx5.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems diff --git a/travis/install-hpx.sh b/travis/install-hpx.sh new file mode 100755 index 000000000..5fa0cda89 --- /dev/null +++ b/travis/install-hpx.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +case "$TRAVIS_OS_NAME" in + linux) + ;; + osx) + set +e + brew update + for p in boost jemalloc gperftools ; do + brew install $p || brew upgrade $p + done + set -e + ;; +esac + +if [ ! -d "$TRAVIS_ROOT/hpx" ]; then + cd $TRAVIS_ROOT + git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx-source + cd hpx-source + mkdir build + cd build + cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF + make -j2 + # make check # target does not exist + make install +else + echo "HPX installed..." + find $TRAVIS_ROOT/hpx +fi diff --git a/travis/install-hpx3.sh b/travis/install-hpx3.sh deleted file mode 100755 index 50bf6878d..000000000 --- a/travis/install-hpx3.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -case "$TRAVIS_OS_NAME" in - linux) - ;; - osx) - set +e - brew update - if [ "$USE_HPX_TARBALL" ] ; then - export HPX_BOOST="homebrew/versions/boost155" - else - export HPX_BOOST="boost" - fi - for p in $HPX_BOOST jemalloc gperftools ; do - brew install $p || brew upgrade $p - done - set -e - ;; -esac - -if [ ! 
-d "$TRAVIS_ROOT/hpx3" ]; then - cd $TRAVIS_ROOT - #if [ "$USE_HPX_TARBALL" ] ; then - # wget -q --no-check-certificate http://stellar.cct.lsu.edu/files/hpx_0.9.11.tar.bz2 - # if [ `which md5` ] ; then - # echo "MD5 signature is:" - # md5 hpx_0.9.11.tar.bz2 - # echo "MD5 signature should be:" - # echo "86a71189fb6344d27bf53d6aa2b33122" - # fi - # tar -xjf hpx_0.9.11.tar.bz2 - # cd hpx_0.9.11 - #else - git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx3-source - cd hpx3-source - #fi - mkdir build - cd build - cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx3 -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF - make -j2 - # make check # target does not exist - make install -else - echo "HPX-3 installed..." - find $TRAVIS_ROOT/hpx3 -fi diff --git a/travis/install-hpx5.sh b/travis/install-hpx5.sh deleted file mode 100755 index 6fa6f29d9..000000000 --- a/travis/install-hpx5.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -if [ ! -d "$TRAVIS_ROOT/hpx5" ] ; then - cd $TRAVIS_ROOT - if [ "0" = "1" ] ; then - wget -q --no-check-certificate http://hpx.crest.iu.edu/release/HPX_Release_v2.0.0.tar.gz - if [ `which shasum` ] ; then - echo "SHA-256 signature is:" - shasum -a 256 HPX_Release_v2.0.0.tar.gz - echo "SHA-256 signature should be:" - echo "647c5f0ef3618f734066c91d741021d7bd38cf21" - fi - tar -xzf HPX_Release_v2.0.0.tar.gz - cd HPX_Release_v2.0.0/hpx - else - export GIT_SSL_NO_VERIFY=1 - git clone --depth 1 http://gitlab.crest.iu.edu/extreme/hpx.git hpx5-source - cd hpx5-source - fi - ./bootstrap - ./configure --prefix=$TRAVIS_ROOT/hpx5 - make -j2 - make check - make install -else - echo "HPX-5 installed..." - find $TRAVIS_ROOT/hpx5 -name hpx-config -fi From 042b1855aa7618669e242682c36fb7574134d57f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 22:46:03 -0800 Subject: [PATCH 02/80] HPX is WIP --- Cxx11/Makefile | 7 ++ Cxx11/nstream-hpx.cc | 176 +++++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_hpx.h | 41 ++++++++++ doc/HPX.md | 13 ++++ 4 files changed, 237 insertions(+) create mode 100644 Cxx11/nstream-hpx.cc create mode 100644 Cxx11/prk_hpx.h create mode 100644 doc/HPX.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e3b9e76fe..2adb8c486 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,6 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA +HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -122,6 +123,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range kokkos: stencil-kokkos transpose-kokkos nstream-kokkos +hpx: nstream-hpx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -193,6 +196,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-hpx: %-hpx.cc prk_util.h prk_hpx.h + $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h ${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) 
-DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ @@ -262,6 +268,7 @@ clean: -rm -f *-rangefor -rm -f *-raja -rm -f *-kokkos + -rm -f *-hpx -rm -f *-thrust -rm -f *-cuda -rm -f *-cublas diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc new file mode 100644 index 000000000..d41f5fe6d --- /dev/null +++ b/Cxx11/nstream-hpx.cc @@ -0,0 +1,176 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_hpx.h"
+#include "prk_util.h"
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, offset;
+  size_t length;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <vector length>";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      length = std::atol(argv[2]);
+      if (length <= 0) {
+        throw "ERROR: vector length must be positive";
+      }
+
+      offset = (argc>3) ? std::atoi(argv[3]) : 0;
+      if (offset < 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+
+  {
+    std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
+        A[i] = 0;
+        B[i] = 2;
+        C[i] = 2;
+    });
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      std::for_each( std::begin(range), std::end(range), [&] (size_t i) {
+          A[i] += B[i] + scalar * C[i];
+      });
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += prk::abs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (prk::abs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_hpx.h b/Cxx11/prk_hpx.h
new file mode 100644
index 000000000..36c523eb1
--- /dev/null
+++ b/Cxx11/prk_hpx.h
@@ -0,0 +1,41 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_HPX_H +#define PRK_HPX_H + +#include +#include + +#include +#include + +#endif /* PRK_HPX_H */ diff --git a/doc/HPX.md b/doc/HPX.md new file mode 100644 index 000000000..1fce480f7 --- /dev/null +++ b/doc/HPX.md @@ -0,0 +1,13 @@ +# + +```sh +cmake .. -DCMAKE_INSTALL_PREFIX=$PRK_DIR/Cxx11/hpx \ + -DCMAKE_CXX_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang \ + -DHPX_WITH_TESTS:BOOL=Off \ + -DHPX_WITH_TESTS_BENCHMARKS:BOOL=Off \ + -DHPX_WITH_TESTS_EXAMPLES:BOOL=Off \ + -DHPX_WITH_TESTS_REGRESSIONS:BOOL=Off \ + -DHPX_WITH_TESTS_UNIT:BOOL=Off +make install +``` From 8c63805f1d6267beaa91398618f4c494224977c7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:46:26 -0800 Subject: [PATCH 03/80] add HPX flags to make.defs. Signed-off-by: Jeff Hammond --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 44 ++++++++++++++++++++++++++++++++++++++++-- common/make.defs.intel | 36 +++++++++++++++++++++++++--------- common/make.defs.llvm | 13 ++++++++++--- 4 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 2adb8c486..478b29cb4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -197,7 +197,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ %-hpx: %-hpx.cc prk_util.h prk_hpx.h - $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8df7db087..4fd4a74ff 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -44,6 +44,9 @@ ORNLACCFLAG=-fopenacc # # MacOS OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL @@ -52,13 +55,33 @@ METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... #SYCLDIR=./triSYCL #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -METALFLAG=-framework MetalPerformanceShaders # # OCCA # @@ -71,7 +94,7 @@ METALFLAG=-framework MetalPerformanceShaders # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -79,6 +102,9 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} @@ -89,6 +115,10 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... 
+HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # @@ -105,6 +135,16 @@ CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# # ISPC # ISPC=ispc diff --git a/common/make.defs.intel b/common/make.defs.intel index cab461c08..145b1e750 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -42,19 +42,37 @@ OFFLOADFLAG=-qopenmp-offload=host # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include -# ProGTX -# https://github.com/ProGTX/sycl-gtx -#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA # @@ -62,7 +80,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -CILKFLAG=-intel-extensions # default +#CILKFLAG=-intel-extensions # default # # TBB # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 6a668bf14..b65febe80 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -85,7 +85,7 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA @@ -95,7 +95,7 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -103,15 +103,22 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. 
# #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # From 0556b39e400134cd586a5b79dd3fc43730876ca2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:02 -0800 Subject: [PATCH 04/80] update HPX flags --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 3 ++- common/make.defs.intel | 5 +++++ common/make.defs.llvm | 3 ++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 478b29cb4..72526a38a 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,7 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA -HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) +HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 4fd4a74ff..7e0736211 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... +HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.intel b/common/make.defs.intel index 145b1e750..92a0d4e64 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -99,6 +99,11 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HWLOCFLAG=-I/usr/local/include +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index b65febe80..4021f6c6d 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... 
+HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # From 5c799cc1faf5261a6a8a05399246eff5fb2cc0b6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:32 -0800 Subject: [PATCH 05/80] this template is really dated at this point --- common/{make.defs.in => make.defs.old} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename common/{make.defs.in => make.defs.old} (100%) diff --git a/common/make.defs.in b/common/make.defs.old similarity index 100% rename from common/make.defs.in rename to common/make.defs.old From d8ef1bdec5f9b27d08655f9ae47ff2b75573d743 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:02:07 -0800 Subject: [PATCH 06/80] use HPX for_each --- Cxx11/nstream-hpx.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc index d41f5fe6d..47d1b775c 100644 --- a/Cxx11/nstream-hpx.cc +++ b/Cxx11/nstream-hpx.cc @@ -120,7 +120,7 @@ int main(int argc, char * argv[]) double scalar(3); { - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] = 0; B[i] = 2; C[i] = 2; @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] += B[i] + scalar * C[i]; }); } From 21621722e7e93426ec6e9b35ae211ab1e2ce6fbe Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:20:10 -0800 Subject: [PATCH 07/80] UPC++ support --- travis/install-deps.sh | 1 + travis/install-upcxx.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100755 travis/install-upcxx.sh diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..72effa7b6 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -76,6 +76,7 @@ case "$PRK_TARGET" in sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT + sh ./travis/install-upcxx.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" diff --git a/travis/install-upcxx.sh b/travis/install-upcxx.sh new file mode 100755 index 000000000..3725d361f --- /dev/null +++ b/travis/install-upcxx.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +UPCXX_RELEASE=upcxx-2019.9.0 +UPCXX_PREFIX=$TRAVIS_ROOT/$UPCXX_RELEASE + +if [ ! -d "$UPCXX_PREFIX" ]; then + cd $TRAVIS_ROOT + wget --no-check-certificate -q https://bitbucket.org/berkeleylab/upcxx/downloads/${UPCXX_RELEASE}.tar.gz + tar -xzf $UPCXX_RELEASE.tar.gz + cd $UPCXX_RELEASE + ./install $TRAVIS_ROOT/upcxx +else + echo "UPC++ installed..." 
+ find $TRAVIS_ROOT/upcxx -name upcxx -type f +fi + From 33cc8d5928b58a5f3656c0fb384ba7f7bbfb4448 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:21:34 -0800 Subject: [PATCH 08/80] install-hpx in deps --- travis/install-deps.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 3c4fc29a1..19aa5bdcc 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -75,6 +75,7 @@ case "$PRK_TARGET" in #sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT + sh ./travis/install-hpx.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT ;; allfortran) @@ -169,11 +170,6 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx) - echo "HPX" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-hpx3.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems From 69efdf6efc464fc3d1cee82877b1eb9ab275618c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:45:51 -0800 Subject: [PATCH 09/80] add UPC++ skeleton --- Cxx11/Makefile | 6 ++ Cxx11/nstream-upcxx.cc | 184 +++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_ranges.h | 18 ++-- Cxx11/prk_upcxx.h | 37 +++++++++ common/make.defs.gcc | 6 ++ common/make.defs.llvm | 6 ++ 6 files changed, 247 insertions(+), 10 deletions(-) create mode 100644 Cxx11/nstream-upcxx.cc create mode 100644 Cxx11/prk_upcxx.h diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 72526a38a..2bb1bc4ba 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -45,6 +45,7 @@ STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) +UPCXXFLAGS = $(CPPFLAGS) -DUSE_UPCXX $(UPCXXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -125,6 +126,8 @@ kokkos: stencil-kokkos transpose-kokkos nstream-kokkos hpx: nstream-hpx +upcxx: nstream-upcxx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -196,6 +199,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-upcxx: %-upcxx.cc prk_util.h prk_upcxx.h + $(UPCXX) $(UPCXXFLAGS) $< -o $@ + %-hpx: %-hpx.cc prk_util.h prk_hpx.h $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc new file mode 100644 index 000000000..083ab96b4 --- /dev/null +++ b/Cxx11/nstream-upcxx.cc @@ -0,0 +1,184 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_upcxx.h" +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + upcxx::init(); + + const int me = upcxx::rank_me(); + const int np = upcxx::rank_n(); + + if (me==0) { + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + } + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0;
+      if (offset < 0) {
+        throw "ERROR: offset must be nonnegative";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Vector length        = " << length << std::endl;
+  std::cout << "Offset               = " << offset << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  auto nstream_time = 0.0;
+
+  std::vector<double> A(length);
+  std::vector<double> B(length);
+  std::vector<double> C(length);
+
+  auto range = prk::range(static_cast<size_t>(0), length);
+
+  double scalar(3);
+
+  {
+    std::for_each(std::begin(range), std::end(range), [&] (size_t i) {
+        A[i] = 0;
+        B[i] = 2;
+        C[i] = 2;
+    });
+
+    for (int iter = 0; iter<=iterations; iter++) {
+
+      if (iter==1) nstream_time = prk::wtime();
+
+      std::for_each(std::begin(range), std::end(range), [&] (size_t i) {
+          A[i] += B[i] + scalar * C[i];
+      });
+    }
+    nstream_time = prk::wtime() - nstream_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double ar(0);
+  double br(2);
+  double cr(2);
+  for (auto i=0; i<=iterations; i++) {
+      ar += br + scalar * cr;
+  }
+
+  ar *= length;
+
+  double asum(0);
+  for (size_t i=0; i<length; i++) {
+      asum += prk::abs(A[i]);
+  }
+
+  double epsilon=1.e-8;
+  if (prk::abs(ar-asum)/asum > epsilon) {
+      std::cout << "Failed Validation on output array\n"
+                << "       Expected checksum: " << ar << "\n"
+                << "       Observed checksum: " << asum << std::endl;
+      std::cout << "ERROR: solution did not validate" << std::endl;
+      return 1;
+  } else {
+      std::cout << "Solution validates" << std::endl;
+      double avgtime = nstream_time/iterations;
+      double nbytes = 4.0 * length * sizeof(double);
+      std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
+                << " Avg time (s): " << avgtime << std::endl;
+  }
+
+  upcxx::finalize();
+  return 0;
+}
+
+
diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h
index 9eb081844..62281e043 100644
--- a/Cxx11/prk_ranges.h
+++ b/Cxx11/prk_ranges.h
@@ -32,16 +32,14 @@
 #ifndef PRK_RANGES_H
 #define PRK_RANGES_H
 
-#if defined(USE_RANGES)
-# if defined(USE_BOOST_IRANGE)
-#  include "boost/range/irange.hpp"
-# elif defined(USE_RANGES_TS)
-#  include "range/v3/view/iota.hpp"
-#  include "range/v3/view/slice.hpp"
-#  include "range/v3/view/stride.hpp"
-# else
-#  error You have not provided a version of ranges to use.
-# endif
+#if defined(USE_BOOST_IRANGE)
+# include "boost/range/irange.hpp"
+#elif defined(USE_RANGES_TS)
+# include "range/v3/view/iota.hpp"
+# include "range/v3/view/slice.hpp"
+# include "range/v3/view/stride.hpp"
+#else
+# error You have not provided a version of ranges to use.
+#endif
 #endif
 
 namespace prk {
diff --git a/Cxx11/prk_upcxx.h b/Cxx11/prk_upcxx.h
new file mode 100644
index 000000000..27db8592e
--- /dev/null
+++ b/Cxx11/prk_upcxx.h
@@ -0,0 +1,37 @@
+///
+/// Copyright (c) 2019, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_UPCXX_H +#define PRK_UPCXX_H + +#include + +#endif /* PRK_UPCXX_H */ diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7e0736211..ec0535f57 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 4021f6c6d..c150d9ed2 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # From 2699970ba76e369e157fd72c0a466628d9bc98a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 16:15:30 -0800 Subject: [PATCH 10/80] fix banner --- Cxx11/nstream-upcxx.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc index 083ab96b4..7925aef08 100644 --- a/Cxx11/nstream-upcxx.cc +++ b/Cxx11/nstream-upcxx.cc @@ -74,7 +74,7 @@ int main(int argc, char * argv[]) if (me==0) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "UPC++ STREAM triad: A = B + scalar * C" << std::endl; } ////////////////////////////////////////////////////////////////////// From a834102a6f69a38aa697a2c1653127a77a67d2db Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 06:48:34 -0700 Subject: [PATCH 11/80] add README to capture what I am learning here --- HALIDE/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 HALIDE/README.md diff --git a/HALIDE/README.md b/HALIDE/README.md new file mode 100644 index 000000000..af170de52 --- /dev/null +++ b/HALIDE/README.md @@ -0,0 +1,11 @@ +# Halide + +# Notes + +``` +$ git clone https://github.com/halide/Halide.git +``` + +``` +$ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` From f37a0ac91fe5d1a2fe16bc53a872ce99dfcb32dc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 
10:21:40 -0700 Subject: [PATCH 12/80] add notes since Halide has pre-modern build system --- HALIDE/README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/HALIDE/README.md b/HALIDE/README.md index af170de52..efa2e2d96 100644 --- a/HALIDE/README.md +++ b/HALIDE/README.md @@ -6,6 +6,51 @@ $ git clone https://github.com/halide/Halide.git ``` +# MacOS + +This works: +``` +make CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Ubuntu 18.10 + +This works: +``` +make PREFIX=/opt/halide +``` + +This does not work: + ``` $ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config ``` + +This does not work: + +``` +$ make CC=/usr/local/Cellar/llvm/8.0.0/bin/clang CXX=/usr/local/Cellar/llvm/8.0.0/bin/clang++ CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Issues + +*TL;DR* Do not try to use non-default compilers. + +https://github.com/halide/Halide/issues/3884 + +Mac: +``` +$ make CC=gcc-9 CXX=g++-9 CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +g++-9 -Wall -Werror -Wno-unused-function -Wcast-qual -Wignored-qualifiers -Wno-comment -Wsign-compare -Wno-unknown-warning-option -Wno-psabi -Wsuggest-override -Woverloaded-virtual -fPIC -O3 -fno-omit-frame-pointer -DCOMPILING_HALIDE -std=c++11 -I/usr/local/Cellar/llvm/8.0.0/include -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/tmp/llvm-20190320-85215-19esl1h/llvm-8.0.0.src/tools/lld/include -DLLVM_VERSION=80 -DWITH_PTX=1 -DWITH_ARM=1 -DWITH_HEXAGON=1 -DWITH_AARCH64=1 -DWITH_X86=1 -DWITH_OPENCL=1 -DWITH_METAL=1 -DWITH_OPENGL=1 -DWITH_D3D12=1 -DWITH_MIPS=1 -DWITH_POWERPC=1 -DWITH_WEBASSEMBLY=1 -DWITH_INTROSPECTION -DWITH_AMDGPU=1 -funwind-tables -c ~/Work/Languages/Halide/src/Util.cpp -o bin/build/Util.o -MMD -MP -MF bin/build/Util.d -MT bin/build/Util.o +~/Work/Languages/Halide/src/Util.cpp: In function 'std::string Halide::Internal::running_program_name()': +~/Work/Languages/Halide/src/Util.cpp:80:19: error: 'PATH_MAX' was not declared in this scope + 80 | char path[PATH_MAX] = { 0 }; + | ^~~~~~~~ +~/Work/Languages/Halide/src/Util.cpp:81:32: error: 'path' was not declared in this scope + 81 | uint32_t size = sizeof(path); + | ^~~~ +At global scope: +cc1plus: error: unrecognized command line option '-Wno-unknown-warning-option' [-Werror] +cc1plus: all warnings being treated as errors +make: *** [bin/build/Util.o] Error 1 +``` From e69f46c08111e7b28ec078f81f8e7f0b965face2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:16 -0700 Subject: [PATCH 13/80] add Halide to examples --- common/make.defs.gcc | 14 ++++++++++++-- common/make.defs.llvm | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8ad79efb2..bf4b46ecf 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -151,13 +151,23 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -#NVCC=/opt/llvm/cocl/bin/cocl +NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 -CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED 
+#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 2aecf26d8..db54b5cc0 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -187,9 +187,9 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -#NVCC=nvcc -arch=sm_50 +NVCC=nvcc -arch=sm_50 CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 From 2bd32e246c5fdb61c5cb1b9158796981cf5718d6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:27 -0700 Subject: [PATCH 14/80] add Halide nstream I have no idea what I am doing and this code is wrong and/or bad. --- HALIDE/README.md => Cxx11/HALIDE.md | 0 Cxx11/Makefile | 6 + Cxx11/nstream-halide.cc | 190 ++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) rename HALIDE/README.md => Cxx11/HALIDE.md (100%) create mode 100644 Cxx11/nstream-halide.cc diff --git a/HALIDE/README.md b/Cxx11/HALIDE.md similarity index 100% rename from HALIDE/README.md rename to Cxx11/HALIDE.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index b435091ed..fe6e8e891 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,6 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) +HALIDEFLAGS = $(HALIDEFLAG) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -133,6 +134,7 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc +halide: nstream-halide boost-compute: nstream-boost-compute # busted @@ -253,6 +255,9 @@ endif $(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.) $(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ +%-halide: %-halide.cc prk_util.h + $(HALIDECXX) $(CXXFLAGS) $< $(HALIDEFLAGS) -o $@ + %-ornlacc: %-ornlacc.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(ORNLACCFLAGS) -o $@ @@ -300,6 +305,7 @@ clean: -rm -f *-cblas -rm -f *-onemkl -rm -f *-occa + -rm -f *-halide -rm -f *-boost-compute -rm -f *-ornlacc -rm -f transpose-async transpose-thread diff --git a/Cxx11/nstream-halide.cc b/Cxx11/nstream-halide.cc new file mode 100644 index 000000000..51f0eee16 --- /dev/null +++ b/Cxx11/nstream-halide.cc @@ -0,0 +1,190 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "Halide.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Halide STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + const Halide::Target target = Halide::get_jit_target_from_environment(); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double scalar = 3.0; + + Halide::Buffer A(length); + Halide::Buffer B(length); + Halide::Buffer C(length); + + for (size_t i=0; i out = nstream.realize(length); +#endif + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + From a54488c28d8e8b2d2314d03340d20da81255bae1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 20:59:45 -0700 Subject: [PATCH 15/80] add Stencil for Halide --- Cxx11/Makefile | 3 +- Cxx11/stencil-halide.cc | 231 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 Cxx11/stencil-halide.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index fe6e8e891..f3e6e5d3b 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -134,7 +134,8 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc -halide: nstream-halide + +halide: nstream-halide stencil-halide boost-compute: nstream-boost-compute # busted diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc new file mode 100644 index 000000000..f0aab6461 --- /dev/null +++ b/Cxx11/stencil-halide.cc @@ -0,0 +1,231 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "stencil_seq.hpp" + +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + // n will never be zero - this is to silence compiler warnings. 
+ if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + std::abort(); +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + prk::vector in(n*n); + prk::vector out(n*n); + + { + for (auto it=0; it(i+j); + out[i*n+j] = 0.0; + } + } + } + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator + stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any + std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); + } + stencil_time = prk::wtime() - stencil_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} From b36ad8d0246a32c40dd1b8e410fdf4d550e8d7a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:07 -0700 Subject: [PATCH 16/80] ignore halide and occa binaries --- .gitignore | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 55140184d..8329936cb 100644 --- a/.gitignore +++ b/.gitignore @@ -184,13 +184,11 @@ Cxx11/p2p-vector-raja Cxx11/p2p-tbb Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-openmp -Cxx11/p2p-doacross-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector +Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-vector Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-tbb @@ -224,6 +222,8 @@ Cxx11/nstream-celerity Cxx11/nstream-hpx Cxx11/nstream-upcxx Cxx11/nstream-executors +Cxx11/nstream-occa +Cxx11/nstream-halide Cxx11/pic Cxx11/pic-dpcpp Cxx11/pic-sycl @@ -258,6 +258,8 @@ Cxx11/stencil-sycl-usm Cxx11/stencil-sycl-explicit Cxx11/stencil-sycl-explicit-usm Cxx11/stencil-dpcpp +Cxx11/stencil-occa +Cxx11/stencil-halide Cxx11/transpose Cxx11/transpose-openmp Cxx11/transpose-mpi @@ -288,6 +290,8 @@ Cxx11/transpose-device-thrust Cxx11/transpose-host-thrust Cxx11/transpose-cublas Cxx11/transpose-cuda +Cxx11/transpose-occa +Cxx11/transpose-halide Cxx11/grid1.cl Cxx11/grid2.cl Cxx11/grid3.cl From 6cb5371e7bc828f109fcdff2f48993b001c1c826 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:22 -0700 Subject: [PATCH 17/80] less wrong --- Cxx11/stencil-halide.cc | 76 ++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc index f0aab6461..bdd1f1487 100644 --- a/Cxx11/stencil-halide.cc +++ b/Cxx11/stencil-halide.cc @@ -61,22 +61,12 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" -#include "stencil_seq.hpp" - -void nothing(const int n, const int t, prk::vector & in, prk::vector & out) -{ - std::cout << "You are trying to use a stencil that does not exist.\n"; - std::cout << "Please generate the new stencil using the code generator\n"; - std::cout << "and add it to the case-switch in the driver." << std::endl; - // n will never be zero - this is to silence compiler warnings. 
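Context for the Halide rewrite that follows: the generated star kernels are replaced by a single Halide Func evaluated over the grid interior. A sketch of the radius-2 PRK star expressed that way, using the 1/(2*i*radius) weights of the generated CPU/SYCL kernels (a sketch under those assumptions, not the exact Func this patch defines):

#include "Halide.h"

// Sketch only: radius-2 PRK star as a Halide pipeline.
void star2_halide(int n, Halide::Buffer<double> &in, Halide::Buffer<double> &out)
{
  Halide::Var x("x"), y("y");
  Halide::Func star2("star2");
  star2(x, y) = out(x, y)
              + 0.25  * (in(x+1, y) - in(x-1, y) + in(x, y+1) - in(x, y-1))
              + 0.125 * (in(x+2, y) - in(x-2, y) + in(x, y+2) - in(x, y-2));

  Halide::Buffer<double> tmp(n - 4, n - 4);
  tmp.set_min(2, 2);      // evaluate only the interior so the in() accesses stay in bounds
  star2.realize(tmp);     // JIT-compiles on first call
  out.copy_from(tmp);     // copy the updated interior back into out
}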
- if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; - std::abort(); -} +#include "Halide.h" int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + std::cout << "C++11/Halide Stencil execution on 2D grid" << std::endl; ////////////////////////////////////////////////////////////////////// // Process and test input parameters @@ -139,54 +129,48 @@ int main(int argc, char* argv[]) std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; std::cout << "Radius of stencil = " << radius << std::endl; - auto stencil = nothing; - if (star) { - switch (radius) { - case 1: stencil = star1; break; - case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; - } - } else { - switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; - } - } + const Halide::Target target = Halide::get_jit_target_from_environment(); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto stencil_time = 0.0; + double stencil_time(0); - prk::vector in(n*n); - prk::vector out(n*n); + Halide::Buffer in(n,n); + Halide::Buffer out(n,n); + + Halide::Var x("x"); + Halide::Var y("y"); + + Halide::Expr c1(0.25); + Halide::Expr c2(0.125); + Halide::Func stencil; + stencil(x,y) = c1 * ( in(x+1,y) + in(x-1,y) + in(x,y+1) + in(x,y+1) ) + + c2 * ( in(x+2,y) + in(x-2,y) + in(x,y+2) + in(x,y+2) ); { - for (auto it=0; it(i+j); - out[i*n+j] = 0.0; - } - } + for (auto i=0; i Date: Tue, 3 Mar 2020 09:44:00 -0800 Subject: [PATCH 18/80] move documentation to the right place Signed-off-by: Jeff Hammond --- {Cxx11 => doc}/HALIDE.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {Cxx11 => doc}/HALIDE.md (100%) diff --git a/Cxx11/HALIDE.md b/doc/HALIDE.md similarity index 100% rename from Cxx11/HALIDE.md rename to doc/HALIDE.md From 52bd76531144854fb3f9e537b1c1e46457853796 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 2 Nov 2020 08:48:46 -0800 Subject: [PATCH 19/80] update Halide stuff for 10.0 release --- common/make.defs.gcc | 6 +++--- common/make.defs.llvm | 6 +++--- common/make.defs.oneapi | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index bf4b46ecf..f0fccc68c 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -162,12 +162,12 @@ CUDAFLAGS+=-arch=sm_50 # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 1764d24f0..08fb7b1a1 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -202,12 +202,12 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib 
-L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index be6b2dc4b..38e163047 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -106,6 +106,16 @@ CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=icpx +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 +# # ISPC # ISPC=ispc From b5b422f1f8f1987d8f8720f412fee512b2ac0721 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Dec 2021 17:53:42 +0200 Subject: [PATCH 20/80] fix make.defs* --- common/make.defs.gcc | 39 -------------------------------------- common/make.defs.intel | 29 +--------------------------- common/make.defs.llvm | 23 ---------------------- common/make.defs.upcxx-hpx | 9 +++++++++ 4 files changed, 10 insertions(+), 90 deletions(-) create mode 100644 common/make.defs.upcxx-hpx diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7a179c356..7c6d9b188 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -56,16 +56,6 @@ METALFLAG=-framework MetalPerformanceShaders # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -87,7 +77,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/sycl/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -107,8 +96,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -<<<<<<< HEAD -======= # hipSYCL # SYCLDIR=/opt/hipSYCL @@ -124,7 +111,6 @@ CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime MPIINC=-I/usr/include/mpich-3.2-x86_64 MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi ->>>>>>> default # # OCCA # @@ -137,11 +123,7 @@ MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -149,19 +131,11 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. 
# #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -<<<<<<< HEAD -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -======= #BOOSTFLAG=-I/usr/include/boost169 BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include #RANGEFLAG=-DUSE_GCC_RANGES ->>>>>>> default PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} #PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc @@ -170,11 +144,8 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx @@ -209,15 +180,6 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # # Halide # -<<<<<<< HEAD -HALIDECXX=c++ -HALIDEDIR=/opt/halide -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 -======= HALIDECXX=${CXX} HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include @@ -225,7 +187,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 ->>>>>>> default # # ISPC # diff --git a/common/make.defs.intel b/common/make.defs.intel index 047c363ad..1abbb0c75 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -43,22 +43,10 @@ OFFLOADFLAG+=-DGPU_SCHEDULE="" # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -81,7 +69,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/codeplay/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -93,19 +80,11 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLFLAG+=${OPENCLFLAG} # NVIDIA target #SYCLFLAG+=-sycl-target ptx64 -<<<<<<< HEAD -======= #SYCLFLAG+=-DPRK_NO_OPENCL_GPU ->>>>>>> default # # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-<<<<<<< HEAD -#SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= SYCLDIR=./triSYCL SYCLCXX=${CXX} ${OPENMPFLAG} SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include @@ -123,7 +102,6 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include #CELERITYDIR=${SYCLDIR} #CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor #CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -131,7 +109,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -#CILKFLAG=-intel-extensions # default +CILKFLAG=-intel-extensions # default # # TBB # @@ -150,11 +128,6 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -# HPX is more complicated... -HWLOCFLAG=-I/usr/local/include -HPXDIR=./hpx -HPXCXX=${HPXDIR}/bin/hpxcxx -HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 502331404..730e1fa08 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -128,11 +128,6 @@ SYCLFLAG+=${OPENCLFLAG} # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -<<<<<<< HEAD -SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= #SYCLDIR=./triSYCL #SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL @@ -155,7 +150,6 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL CELERITYDIR=${SYCLDIR} CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -164,26 +158,13 @@ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. 
# -<<<<<<< HEAD -#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include -======= #BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include # old Homebrew #BOOSTFLAG=-I/usr/include/boost169 # Linux BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_2/include # new Homebrew @@ -196,7 +177,6 @@ SYCLFLAG+=${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} PSTLFLAG+=-I./llvm-pstl/include -DLLVM_PSTL ->>>>>>> default KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl KOKKOSFLAG+=${OPENMPFLAG} @@ -205,11 +185,8 @@ RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx diff --git a/common/make.defs.upcxx-hpx b/common/make.defs.upcxx-hpx new file mode 100644 index 000000000..a30623ad5 --- /dev/null +++ b/common/make.defs.upcxx-hpx @@ -0,0 +1,9 @@ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math + +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} From 285677d201e1d04ccb68eda8df07eb86b52d549c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Jan 2022 12:15:37 +0200 Subject: [PATCH 21/80] factor out PRK MPI module utility --- FORTRAN/Makefile | 11 +++++-- FORTRAN/prk_mpi.F90 | 60 +++++++++++++++++++++++++++++++++++ FORTRAN/transpose-a2a-mpi.F90 | 29 ----------------- FORTRAN/transpose-acc-mpi.F90 | 29 ----------------- FORTRAN/transpose-get-mpi.F90 | 29 ----------------- FORTRAN/transpose-p2p-mpi.F90 | 29 ----------------- 6 files changed, 68 insertions(+), 119 deletions(-) create mode 100644 FORTRAN/prk_mpi.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4a1315edb..54e0b11ae 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -96,6 +96,9 @@ blas: dgemm-blas prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o +prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 + $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 $(FC) $(FCFLAGS) stencil.F90 stencil_serial.o prk_mod.o -o $@ @@ -119,10 +122,10 @@ dgemm-blas: dgemm-blas.F90 prk.mod $(MPIFORT) $(FCFLAGS) $< prk_mod.o $(GAFLAG) -o $@ %-mpi-openmp: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ + $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o prk_mpi_mod.o -o $@ -%-mpi: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $< prk_mod.o -o $@ +%-mpi: %-mpi.F90 prk.mod prk_mpi.mod + $(MPIFORT) $(FCFLAGS) $< prk_mod.o prk_mpi_mod.o -o $@ %-coarray: %-coarray.F90 prk.mod $(CAFC) $(FCFLAGS) $< prk_mod.o $(COARRAYFLAG) -o $@ @@ -142,6 +145,8 @@ dgemm-blas: dgemm-blas.F90 prk.mod clean: -rm -f prk.mod -rm -f prk.f18.mod + -rm -f prk_mpi.mod + -rm -f prk_mpi.f18.mod -rm -f *.o -rm -f *.i90 -rm -f *.dbg diff --git a/FORTRAN/prk_mpi.F90 b/FORTRAN/prk_mpi.F90 new file mode 100644 index 000000000..f1508f450 --- /dev/null 
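The module added below replaces identical private copies in the MPI transpose programs (removed further down). A minimal sketch of a caller, linked against the prk_mod.o and prk_mpi_mod.o objects from the Makefile rules above (illustrative only, not one of the actual programs):

! Illustrative caller only.
program prk_mpi_demo
  use, intrinsic :: iso_fortran_env
  use mpi_f08
  use prk_mpi
  implicit none
  real(kind=REAL64), allocatable :: A(:,:)
  call MPI_Init()
  allocate( A(4,4) )
  A = 1.0d0
  call mpi_print_matrix(A, 'A =')   ! prints rank by rank, separated by barriers
  deallocate( A )
  call MPI_Finalize()
end program prk_mpi_demo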
+++ b/FORTRAN/prk_mpi.F90 @@ -0,0 +1,60 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use, intrinsic :: iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..a57615201 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..6e2904a47 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! 
******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ecd6ed18d 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..1ae8dbc9a 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! 
******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 From a640d7add111bbcc05341cd4e162dd7080e4244d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 5 Jan 2022 07:31:12 -0800 Subject: [PATCH 22/80] extra targets --- FORTRAN/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 3cd82fe25..bcc68182a 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -38,7 +38,7 @@ ifeq ($(findstring ifx,$(FC)),ifx) endif # GCC (also matches pgfortran so PGI must come after) ifeq ($(findstring gfortran,$(FC)),gfortran) - EXTRA = target coarray taskloop openacc + EXTRA = target coarray taskloop openacc blas endif # PGI and LLVM Flang ifeq ($(findstring flang,$(FC)),flang) @@ -50,7 +50,7 @@ ifeq ($(findstring pgf,$(FC)),pgf) FCFLAGS += -DPGI endif ifeq ($(findstring nvf,$(FC)),nvf) - EXTRA = target openacc cufortran + EXTRA = target openacc cufortran stdpar blas FCFLAGS += -DNVHPC endif ifeq ($(findstring xlf,$(FC)),xlf) From 719977ce107152e4355629ca749d2914957ca570 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 11:56:26 +0300 Subject: [PATCH 23/80] implement Pablo's changes in code generator Signed-off-by: Jeff Hammond --- Cxx11/generate-sycl-stencil.py | 26 ++----- Cxx11/stencil_sycl.hpp | 121 ++++++++++++++++----------------- 2 files changed, 64 insertions(+), 83 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 22756399e..c67f2d124 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -33,20 +33,14 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') src.write(' sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n') src.write(' h.parallel_for>(') - src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('sycl::range<2> {n-'+str(radius)+',n-'+str(radius)+'}, ') src.write('[=] (sycl::item<2> it) {\n') if (dim==2): - src.write(' sycl::id<2> xy = it.get_id();\n') + src.write(' sycl::id<2> xy = it.get_id() + sycl::id<2> {'+str(radius)+','+str(radius)+'};\n') src.write(' out[xy] += ') else: - # 1D indexing the slow way - #src.write(' auto i = it[0];\n') - #src.write(' auto j = it[1];\n') - #src.write(' out[i*n+j] += ') - # 1D indexing the fast way - src.write(' const auto i = it[0];\n') - src.write(' const auto j = it[1];\n') + src.write(' const auto i = it[0] + '+str(radius)+';\n') + src.write(' const auto j = it[1] + '+str(radius)+';\n') src.write(' out[i*n+j] += ') if pattern == 'star': for i in range(1,radius+1): @@ -62,18 +56,6 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('\n'+19*' ') src.write('+in[xy-dy'+str(i)+'] 
* static_cast('+str(-1./(2.*i*radius))+')') else: - # 1D indexing the slow way - #if i > 1: - # src.write('\n') - # src.write(22*' ') - #src.write('+in[i*n+(j+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[i*n+(j-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i+'+str(i)+')*n+j] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i-'+str(i)+')*n+j] * static_cast('+str(-1./(2.*i*radius))+')') - # 1D indexing the fast way if i > 1: src.write('\n') src.write(30*' ') diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 024e796c4..64af40b79 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,4 +1,3 @@ - // declare the kernel name used in SYCL parallel_for template class star1_1d; @@ -143,18 +142,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -176,18 +175,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy3(sycl::range<2> {0,3}); h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id() + sycl::id<2> {3,3}; - out[xy] += +in[xy+dx1] * static_cast(0.166666666667) - +in[xy-dx1] * static_cast(-0.166666666667) - +in[xy+dy1] * static_cast(0.166666666667) - +in[xy-dy1] * static_cast(-0.166666666667) - +in[xy+dx2] * static_cast(0.0833333333333) - +in[xy-dx2] * static_cast(-0.0833333333333) - +in[xy+dy2] * static_cast(0.0833333333333) - +in[xy-dy2] * static_cast(-0.0833333333333) - +in[xy+dx3] * static_cast(0.0555555555556) - +in[xy-dx3] * static_cast(-0.0555555555556) - +in[xy+dy3] * static_cast(0.0555555555556) - +in[xy-dy3] * static_cast(-0.0555555555556); + out[xy] += +in[xy+dx1] * static_cast(0.16666666666666666) + +in[xy-dx1] * static_cast(-0.16666666666666666) + +in[xy+dy1] * static_cast(0.16666666666666666) + +in[xy-dy1] * static_cast(-0.16666666666666666) + +in[xy+dx2] * static_cast(0.08333333333333333) + +in[xy-dx2] * static_cast(-0.08333333333333333) + +in[xy+dy2] * static_cast(0.08333333333333333) + +in[xy-dy2] * static_cast(-0.08333333333333333) + 
+in[xy+dx3] * static_cast(0.05555555555555555) + +in[xy-dx3] * static_cast(-0.05555555555555555) + +in[xy+dy3] * static_cast(0.05555555555555555) + +in[xy-dy3] * static_cast(-0.05555555555555555); }); }); } @@ -202,18 +201,18 @@ void star3(sycl::queue & q, const size_t n, const T * in, T * out) h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -238,10 +237,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + +in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * static_cast(0.03125) @@ -277,10 +276,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.0625) +in[xy+dy2] * static_cast(0.0625) +in[xy-dy2] * static_cast(-0.0625) - +in[xy+dx3] * static_cast(0.0416666666667) - +in[xy-dx3] * static_cast(-0.0416666666667) - +in[xy+dy3] * static_cast(0.0416666666667) - +in[xy-dy3] * static_cast(-0.0416666666667) + +in[xy+dx3] * static_cast(0.041666666666666664) + +in[xy-dx3] * static_cast(-0.041666666666666664) + +in[xy+dy3] * static_cast(0.041666666666666664) + +in[xy-dy3] * static_cast(-0.041666666666666664) +in[xy+dx4] * static_cast(0.03125) +in[xy-dx4] * static_cast(-0.03125) +in[xy+dy4] * static_cast(0.03125) @@ -307,10 +306,10 @@ void star4(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + 
+in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * static_cast(0.03125) @@ -339,10 +338,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) @@ -384,10 +383,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.05) +in[xy+dy2] * static_cast(0.05) +in[xy-dy2] * static_cast(-0.05) - +in[xy+dx3] * static_cast(0.0333333333333) - +in[xy-dx3] * static_cast(-0.0333333333333) - +in[xy+dy3] * static_cast(0.0333333333333) - +in[xy-dy3] * static_cast(-0.0333333333333) + +in[xy+dx3] * static_cast(0.03333333333333333) + +in[xy-dx3] * static_cast(-0.03333333333333333) + +in[xy+dy3] * static_cast(0.03333333333333333) + +in[xy-dy3] * static_cast(-0.03333333333333333) +in[xy+dx4] * static_cast(0.025) +in[xy-dx4] * static_cast(-0.025) +in[xy+dy4] * static_cast(0.025) @@ -418,10 +417,10 @@ void star5(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) From 527e58c517e52c16f8d8b9e41016878fa5c6b9bf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:02:04 +0300 Subject: [PATCH 24/80] remove unnecessary deprecated offset Signed-off-by: Jeff Hammond --- Cxx11/stencil-2d-sycl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index ee42e2da0..541273634 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -144,7 +144,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, [=] (sycl::item<2> it) { + h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); in[xy] += static_cast(1); }); From 643b7965d2f4e47453cf5eaf4ff19db33a50313e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:12:31 +0300 Subject: [PATCH 25/80] add a workaround for FP64 problems with DPC++ on TGL --- common/make.defs.oneapi | 9 +++++++-- 1 file changed, 7 
insertions(+), 2 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index ec6421d24..246af6200 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -19,7 +19,7 @@ CXX=icpx -std=c++20 -pthread #--gcc-toolchain=/opt/gcc/11.2.0 # Compiler flags # # -xHOST is appropriate for most cases. -DEFAULT_OPT_FLAGS=-g -O3 -xHOST +DEFAULT_OPT_FLAGS=-g3 -O3 -xHOST # # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 @@ -32,6 +32,7 @@ OPENMPFLAG=-qopenmp OPENMPSIMDFLAG=-qopenmp-simd OFFLOADFLAG=-fopenmp-targets=spir64 OFFLOADFLAG+=-DGPU_SCHEDULE="" +STDPARFLAG=-parallel -qmkl # # OpenCL flags # @@ -59,8 +60,12 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL # SYCLCXX=dpcpp SYCLFLAG=-fsycl -SYCLFLAG+=-std=c++17 -O3 +SYCLFLAG+=-std=c++17 -O3 -g3 SYCLFLAG+=-DDPCPP +# this is because the DPC++ compiler will fail to compile run on Tiger Lake +# even though the code explicitly checks for FP64 support and only instantiates the +# template when the device query says FP64 is supported. +SYCLFLAG+=-DDPCPP_NO_DOUBLE # # # OCCA From 2d2898f1a5dd1b9c711fc8b6b65324ae3fae57c6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:15:37 +0300 Subject: [PATCH 26/80] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/transpose-2d-sycl.cc | 11 ++++++++++- Cxx11/transpose-sycl-usm.cc | 8 +++++++- Cxx11/transpose-sycl.cc | 8 +++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc index 83092891e..2fbe8938b 100644 --- a/Cxx11/transpose-2d-sycl.cc +++ b/Cxx11/transpose-2d-sycl.cc @@ -217,7 +217,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -234,7 +236,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -250,13 +254,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc index c1d9a4fec..1ec5c1470 100644 --- a/Cxx11/transpose-sycl-usm.cc +++ b/Cxx11/transpose-sycl-usm.cc @@ -200,7 +200,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -217,7 +219,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,16 +237,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index d3bcc0215..da0d596c0 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -216,7 +216,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,7 +235,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -249,16 +253,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From 7f5b6fa0d40665a297b0e66c8dc94b3a9948a321 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:18:16 +0300 Subject: [PATCH 27/80] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/nstream-sycl-explicit-usm.cc | 8 +++++++- Cxx11/nstream-sycl-explicit.cc | 8 +++++++- Cxx11/nstream-sycl-usm.cc | 8 +++++++- Cxx11/nstream-sycl.cc | 8 +++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index 22325b565..aa5c5c690 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -278,7 +278,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -295,7 +297,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -311,16 +315,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index a3083a244..adf045d32 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -271,7 +271,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -288,7 +290,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -304,16 +308,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index f119746ff..e872a5130 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -256,7 +256,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -273,7 +275,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -289,16 +293,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index a95a163aa..140125f9d 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -253,7 +253,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -270,7 +272,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -286,16 +290,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From 86b6acd2a6c9760d9b6d4bb39e41578561d5f0b3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:09:29 +0300 Subject: [PATCH 28/80] no double stuff --- Cxx11/stencil-2d-sycl.cc | 11 ++++++++++- Cxx11/stencil-sycl-usm.cc | 8 +++++++- Cxx11/stencil-sycl.cc | 8 +++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index 541273634..b6eeb09bc 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -281,7 +281,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -298,7 +300,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -314,13 +318,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index 8b7adfac5..b219b24f1 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -270,7 +270,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -287,7 +289,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -303,16 +307,18 @@ int main(int argc, char * argv[]) try { sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index b78706df2..8947c8dee 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -279,7 +279,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -296,7 +298,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -312,16 +316,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From bfc6bb94d033852e6b303a253e334fa733a6aee4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:50:08 +0300 Subject: [PATCH 29/80] nstream C OpenACC --- C1z/nstream-openacc.c | 173 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 C1z/nstream-openacc.c diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c new file mode 100644 index 000000000..051342f45 --- /dev/null +++ b/C1z/nstream-openacc.c @@ -0,0 +1,173 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. 
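
A minimal sketch of the accumulate-triad pattern this kernel implements, assuming an OpenACC toolchain (e.g. nvc++ -acc), the usual PRK starting values A=0, B=2, C=2, and hard-coded sizes; the real benchmark reads the iteration count and vector length from the command line and reports the MB/s figure described in the NOTES above:

    // Illustrative OpenACC triad: A += B + scalar*C on device-resident vectors.
    // Sizes, names and initial values are assumptions of this sketch, not the
    // benchmark's command-line interface.
    #include <cstdio>
    #include <cstdlib>
    #include <cmath>
    #include <openacc.h>

    int main(void)
    {
      const int    iterations = 10;
      const size_t length     = 1000000;
      const double scalar     = 3.0;
      const size_t bytes      = length * sizeof(double);

      // acc_malloc returns device memory, so every loop marks the pointers deviceptr.
      double * A = static_cast<double*>(acc_malloc(bytes));
      double * B = static_cast<double*>(acc_malloc(bytes));
      double * C = static_cast<double*>(acc_malloc(bytes));

      #pragma acc parallel loop deviceptr(A,B,C)
      for (size_t i = 0; i < length; ++i) {
        A[i] = 0.0;
        B[i] = 2.0;
        C[i] = 2.0;
      }

      for (int k = 0; k <= iterations; ++k) {
        #pragma acc parallel loop deviceptr(A,B,C)
        for (size_t i = 0; i < length; ++i) {
          A[i] += B[i] + scalar * C[i];
        }
      }

      // Each pass adds B[i] + scalar*C[i] = 2 + 3*2 = 8, so after iterations+1 passes
      // every element of A is 8*(iterations+1); the checksum scales that by length.
      double asum = 0.0;
      #pragma acc parallel loop reduction(+:asum) deviceptr(A)
      for (size_t i = 0; i < length; ++i) {
        asum += std::fabs(A[i]);
      }
      const double expected = 8.0 * (iterations + 1) * (double)length;

      acc_free(A); acc_free(B); acc_free(C);

      printf("checksum %.1f expected %.1f\n", asum, expected);
      return (std::fabs(asum - expected) / expected < 1.0e-8) ? 0 : 1;
    }

Because acc_malloc hands back device memory, every loop that touches A, B or C has to name them in a deviceptr clause; that is the convention the OpenACC ports in this series follow.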
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC STREAM triad: A = B + scalar * C\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the vector + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Vector length must be greater than 0\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + double * restrict C = acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + From e15f271a9153a24ae04d6911bb11dc9148d51929 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:08 +0300 Subject: [PATCH 30/80] transpose OpenACC --- C1z/transpose-openacc.c | 167 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 C1z/transpose-openacc.c diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c new file mode 100644 index 000000000..679afb1d8 --- /dev/null +++ b/C1z/transpose-openacc.c @@ -0,0 +1,167 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// C11-ification by Jeff Hammond, June 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC Matrix transpose: B = A^T\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> [tile size]\n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // order of a the matrix + int order = atoi(argv[2]); + if (order <= 0) { + printf("ERROR: Matrix Order must be greater than 0\n"); + return 1; + } + + // default tile size for tiling of local transpose + int tile_size = (argc>3) ? 
atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time = 0.0; + + size_t bytes = order*order*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i Date: Wed, 18 May 2022 11:07:13 +0300 Subject: [PATCH 31/80] transpose OpenACC --- C1z/Makefile | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index f8927c191..5e01f0894 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -22,34 +22,22 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) +ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) -.PHONY: all clean serial thread openmp target taskloop ispc # cilk +.PHONY: all clean serial thread openmp tasks target taskloop ispc EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icc,$(CC)),icc) - EXTRA += target - endif -else - ifneq ($(findstring icx,$(CC)),icx) - EXTRA += target - endif -endif ifdef ($(ISPC)) EXTRA += ispc endif ifneq ($(CILKFLAG),) EXTRA += cilk endif -ifeq ($(findstring xlc,$(CC)),xlc) - EXTRA = target - CFLAGS += -DXLC -endif -ifneq ($(findstring icx,$(CC)),icx) - EXTRA += tasks +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif all: serial thread openmp $(EXTRA) @@ -83,6 +71,8 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop +openacc: nstream-openacc transpose-openacc + cilk: stencil-cilk transpose-cilk ispc: transpose-ispc @@ -132,6 +122,9 @@ p2p-2d: p2p-2d.c prk_util.h %-openmp: %-openmp.c prk_util.h prk_openmp.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ +%-openacc: %-openacc.c prk_util.h + $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ @@ -161,6 +154,7 @@ clean: -rm -f p2p-sse p2p-avx p2p-avx3 p2p-avx-tasks-openmp -rm -f *-2d -rm -f *-openmp + -rm -f *-openacc -rm -f *-mpi -rm -f *-petsc -rm -f *-target From dc4f0554be4ae24cb3f2d64977529fa8eac797f6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:44 +0300 Subject: [PATCH 32/80] ignore and build --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 7843446ba..cb7046a77 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ C1z/nstream-mmap C1z/nstream-mmap-openmp C1z/nstream-mpi C1z/nstream-openmp +C1z/nstream-openacc C1z/nstream-petsc C1z/nstream-target C1z/nstream-taskloop @@ -96,6 +97,7 @@ C1z/transpose-2d-openmp C1z/transpose-cilk C1z/transpose-ispc C1z/transpose-openmp +C1z/transpose-openacc C1z/transpose-petsc C1z/transpose-target C1z/transpose-taskloop From a7bb31096312af10dbbb09c9cb3e7b894afcfcc3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:16:35 +0300 Subject: [PATCH 33/80] OpenACC C stencil --- .gitignore | 1 + C1z/Makefile | 2 +- C1z/stencil-openacc.c | 230 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 232 insertions(+), 1 deletion(-) 
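
The transpose kernel added above shares the same device-memory conventions; its core is a loop nest that accumulates B += A^T and then bumps every element of A by one so each sweep transposes fresh values. A self-contained sketch with illustrative sizes (the real code also accepts an optional tile size and times the sweeps):

    // Illustrative OpenACC transpose: B += A^T followed by A += 1 each sweep.
    // Order and iteration count are assumptions of this sketch.
    #include <cstdio>
    #include <cmath>
    #include <openacc.h>

    int main(void)
    {
      const int order      = 1024;
      const int iterations = 10;
      const size_t bytes   = (size_t)order * order * sizeof(double);

      double * A = static_cast<double*>(acc_malloc(bytes));
      double * B = static_cast<double*>(acc_malloc(bytes));

      #pragma acc parallel loop collapse(2) deviceptr(A,B)
      for (int i = 0; i < order; ++i) {
        for (int j = 0; j < order; ++j) {
          A[i*order+j] = (double)(i*order+j);   // A(i,j) = i*order + j
          B[i*order+j] = 0.0;
        }
      }

      for (int k = 0; k <= iterations; ++k) {
        #pragma acc parallel loop collapse(2) deviceptr(A,B)
        for (int i = 0; i < order; ++i) {
          for (int j = 0; j < order; ++j) {
            B[i*order+j] += A[j*order+i];   // accumulate the transpose
            A[j*order+i] += 1.0;            // each element incremented exactly once per sweep
          }
        }
      }

      // After iterations+1 sweeps, B(i,j) = A0(j,i)*(iterations+1) + (0+1+...+iterations).
      const double addit = 0.5 * iterations * (iterations + 1.0);
      double abserr = 0.0;
      #pragma acc parallel loop collapse(2) reduction(+:abserr) deviceptr(B)
      for (int i = 0; i < order; ++i) {
        for (int j = 0; j < order; ++j) {
          const double reference = (double)(j*order+i) * (iterations + 1.0) + addit;
          abserr += std::fabs(B[i*order+j] - reference);
        }
      }

      acc_free(A); acc_free(B);

      printf("sum of absolute differences = %e\n", abserr);
      return (abserr < 1.0e-8) ? 0 : 1;
    }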
create mode 100644 C1z/stencil-openacc.c diff --git a/.gitignore b/.gitignore index cb7046a77..df8aeaa8f 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ C1z/stencil-2d C1z/stencil-2d-openmp C1z/stencil-cilk C1z/stencil-openmp +C1z/stencil-openacc C1z/stencil-target C1z/stencil-taskloop C1z/transpose diff --git a/C1z/Makefile b/C1z/Makefile index 5e01f0894..c8c61ed10 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -71,7 +71,7 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop -openacc: nstream-openacc transpose-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc cilk: stencil-cilk transpose-cilk diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c new file mode 100644 index 000000000..edc7e994b --- /dev/null +++ b/C1z/stencil-openacc.c @@ -0,0 +1,230 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - C99-ification by Jeff Hammond, February 2016. +/// - C11-ification by Jeff Hammond, June 2017. 
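
The stencil operators themselves live in a generated header (stencil_openacc.h, produced by generate-c-stencil.py in a later patch of this series). As a rough picture of what one generated instance looks like, here is a radius-1 star kernel in that style; the 32x32 tile mirrors the pragma emitted by the generator's OpenACC path, and the +/-0.5 weights are the assumed PRK values for a radius-1 star:

    // Illustrative radius-1 star kernel in the style of the generated stencil_openacc.h.
    // Weights (+/-0.5) and the tile size are assumptions of this sketch.
    void star1(const int n, const double * __restrict__ in, double * __restrict__ out)
    {
      #pragma acc parallel loop tile(32,32) deviceptr(in,out)
      for (int i = 1; i < n-1; ++i) {
        for (int j = 1; j < n-1; ++j) {
          out[i*n+j] += in[(i-1)*n + j] * -0.5
                      + in[i*n + (j-1)] * -0.5
                      + in[i*n + (j+1)] *  0.5
                      + in[(i+1)*n + j] *  0.5;
        }
      }
    }

The driver below then only has to pick the starN/gridN function matching the requested radius and hand it device pointers.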
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +typedef void (*stencil_t)(const int, const double * restrict, double * restrict); + +void nothing(const int n, const double * restrict in, double * restrict out) +{ + printf("You are trying to use a stencil that does not exist.\n"); + printf("Please generate the new stencil using the code generator.\n"); + // n will never be zero - this is to silence compiler warnings. + if (n==0) printf("%p %p\n", in, out); + abort(); +} + +#include "stencil_openacc.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION); + printf("C11/OpenACC Stencil execution on 2D grid\n"); + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3){ + printf("Usage: <# iterations> [ ]\n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + int n = atoi(argv[2]); + if (n < 1) { + printf("ERROR: grid dimension must be positive\n"); + return 1; + } else if (n > floor(sqrt(INT_MAX))) { + printf("ERROR: grid dimension too large - overflow risk\n"); + return 1; + } + + // stencil pattern + bool star = true; + if (argc > 3) { + char* pattern = argv[3]; + star = (0==strncmp(pattern,"star",4)) ? true : false; + } + + // stencil radius + int radius = 2; + if (argc > 4) { + radius = atoi(argv[4]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + printf("ERROR: Stencil radius negative or too large\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Grid sizes = %d\n", n); + printf("Type of stencil = %s\n", (star ? "star" : "grid") ); + printf("Radius of stencil = %d\n", radius ); + + stencil_t stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + case 6: stencil = star6; break; + case 7: stencil = star7; break; + case 8: stencil = star8; break; + case 9: stencil = star9; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + case 6: stencil = grid6; break; + case 7: stencil = grid7; break; + case 8: stencil = grid8; break; + case 9: stencil = grid9; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time = 0.0; + + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); + size_t bytes = n*n*sizeof(double); + + double * restrict in = acc_malloc(bytes); + double * restrict out = acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i epsilon) { + printf("ERROR: L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); + return 1; + } else { + printf("Solution validates\n"); +#ifdef VERBOSE + printf("L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); +#endif + const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2*stencil_size+1) * active_points; + double avgtime = stencil_time/iterations; + printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0e-6 * (double)flops/avgtime, avgtime ); + } + + return 0; +} From c8fe3a913230cfd8a55739d0fef0cf08b39ffc7c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:00 +0300 Subject: [PATCH 34/80] stencil for OpenACC --- C1z/generate-c-stencil.py | 6 +- C1z/stencil_openacc.h | 3126 +++++++++++++++++++++++++++++++++++++ 2 files changed, 3130 insertions(+), 2 deletions(-) create mode 100644 C1z/stencil_openacc.h diff --git a/C1z/generate-c-stencil.py b/C1z/generate-c-stencil.py index f6dc86032..20a2c9455 100755 --- a/C1z/generate-c-stencil.py +++ b/C1z/generate-c-stencil.py @@ -21,9 +21,11 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim): if (model=='openmp'): outer += 'OMP_FOR()\n ' elif (model=='target'): - outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n ' + outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) )\n ' elif (model=='taskloop'): outer += 'OMP_TASKLOOP( firstprivate(n) shared(in,out) grainsize(gs) )\n ' + elif (model=='openacc'): + outer += 'PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) )\n ' elif (model=='cilk'): outer += '_Cilk_' @@ -82,7 +84,7 @@ def instance(src,model,pattern,r,dim): codegen(src,pattern,stencil_size,r,W,model,dim) def main(): - for model in ['seq','openmp','target','cilk','taskloop']: + for model in ['seq','openmp','target','cilk','taskloop','openacc']: src = open('stencil_'+model+'.h','w') for pattern in ['star','grid']: for r in range(1,10): diff --git a/C1z/stencil_openacc.h b/C1z/stencil_openacc.h new file mode 100644 index 000000000..09652a00b --- /dev/null +++ b/C1z/stencil_openacc.h @@ -0,0 +1,3126 @@ +void star1(const int n, const double * restrict in, double * restrict out) { + PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) ) + for (int i=1; i Date: Wed, 18 May 2022 12:03:20 +0300 Subject: [PATCH 35/80] remove schedule --- C1z/stencil_target.h | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/C1z/stencil_target.h b/C1z/stencil_target.h index b50d70636..28d1a5fcf 100644 --- a/C1z/stencil_target.h +++ b/C1z/stencil_target.h @@ -1,5 +1,5 @@ void star1(const int n, const double * restrict in, double * restrict out) { - OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) ) + OMP_TARGET( teams distribute parallel for simd collapse(2) ) for (int i=1; i Date: Wed, 18 May 2022 12:03:28 +0300 Subject: [PATCH 36/80] OpenACC --- C1z/Makefile | 11 ++++++----- C1z/transpose-openacc.c | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index c8c61ed10..f719a9096 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -3,13 +3,14 @@ include ../common/PRKVERSION CPPFLAGS = -DPRKVERSION=$(PRKVERSION) -CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) - # debugging ifdef VERBOSE - CFLAGS += -DVERBOSE + CPPFLAGS += -DVERBOSE endif +CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) + + ifdef PRK_USE_MMAP CFLAGS += -DPRK_USE_MMAP endif @@ -22,10 +23,10 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) -ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) +OPENACCFLAGS = $(OPENACCFLAG) .PHONY: all clean serial thread openmp tasks target taskloop ispc @@ -123,7 +124,7 @@ p2p-2d: p2p-2d.c prk_util.h $(CC) 
$(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ %-openacc: %-openacc.c prk_util.h - $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + $(CC) $(CFLAGS) $< $(OPENACCFLAGS) $(EXTRA_CLIBS) -o $@ %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 679afb1d8..0ffc76c8e 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -71,14 +71,12 @@ int main(int argc, char * argv[]) return 1; } - // number of times to do the transpose int iterations = atoi(argv[1]); if (iterations < 1) { printf("ERROR: iterations must be >= 1\n"); return 1; } - // order of a the matrix int order = atoi(argv[2]); if (order <= 0) { printf("ERROR: Matrix Order must be greater than 0\n"); From 6f8e9d1c8564abbf206d95a6d3da7be601ce8993 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:45 +0300 Subject: [PATCH 37/80] cleanup --- Cxx11/Makefile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a96805be9..365e92363 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -1,7 +1,7 @@ -include ../common/Cxx11.defs +include ../common/make.defs include ../common/PRKVERSION -CPPFLAGS = -DPRKVERSION=$(PRKVERSION) +CPPFLAGS = -DPRKVERSION=$(PRKVERSION) # debugging ifdef VERBOSE @@ -31,7 +31,7 @@ endif #ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm ASMFLAGS = -fverbose-asm -OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP +OMPFLAGS = $(OPENMPFLAG) TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) -DCL_HPP_MINIMUM_OPENCL_VERSION=120 -DCL_HPP_TARGET_OPENCL_VERSION=120 -DCL_HPP_ENABLE_EXCEPTIONS # We do not yet handle all possible exceptions... @@ -62,18 +62,17 @@ OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib boost-compute thrust executor oneapi onemkl EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icpc,$(CXX)),icpc) - EXTRA += target - endif -else - EXTRA += target +ifneq ($(findstring nvc++,$(CXX)),nvc++) + EXTRA += ranges stl pstl +endif +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif -ifneq ($(findstring pgc++,$(CXX)),pgc++) - EXTRA += pstl +ifneq ($(SYCLCC),) + EXTRA += sycl endif -all: sequential vector valarray openmp taskloop stl ranges opencl sycl $(EXTRA) +all: sequential vector valarray openmp taskloop opencl $(EXTRA) sequential: p2p stencil transpose nstream dgemm sparse @@ -137,7 +136,7 @@ oneapi: onemkl dpcpp sycl onedpl occa: transpose-occa nstream-occa -openacc: p2p-hyperplane-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc p2p-hyperplane-openacc stdpar: nstream-stdpar transpose-stdpar #stencil-stdpar p2p-stdpar From 706bdaeb0e0c7deab21b56949a0078636eeb8026 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:55 +0300 Subject: [PATCH 38/80] remove USE_OPENMP --- Cxx11/prk_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index e5314fd81..93a037f78 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -81,7 +81,7 @@ #endif // omp_get_wtime() -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) #include #endif @@ -301,7 +301,7 @@ namespace prk { static inline double wtime(void) { -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) return omp_get_wtime(); #else using t = std::chrono::high_resolution_clock; From 59b06de28bd0fcf5bf699dbb058ef4f149d2ee38 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 
12:04:10 +0300 Subject: [PATCH 39/80] remove unnecessary indirection --- common/Cxx11.defs | 1 - 1 file changed, 1 deletion(-) delete mode 100644 common/Cxx11.defs diff --git a/common/Cxx11.defs b/common/Cxx11.defs deleted file mode 100644 index d146ce6f7..000000000 --- a/common/Cxx11.defs +++ /dev/null @@ -1 +0,0 @@ -include ../common/make.defs From 9a20e2ec7e90fc82bdce894a261de41105eb935f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:27:53 +0300 Subject: [PATCH 40/80] OpenACC --- .gitignore | 5 + C1z/nstream-openacc.c | 1 - C1z/stencil-openacc.c | 5 +- Cxx11/generate-cxx-stencil.py | 13 +- Cxx11/nstream-openacc.cc | 177 +++++++++++++++ Cxx11/stencil-openacc.cc | 233 ++++++++++++++++++++ Cxx11/stencil_openacc.hpp | 397 ++++++++++++++++++++++++++++++++++ Cxx11/transpose-openacc.cc | 173 +++++++++++++++ 8 files changed, 998 insertions(+), 6 deletions(-) create mode 100644 Cxx11/nstream-openacc.cc create mode 100644 Cxx11/stencil-openacc.cc create mode 100644 Cxx11/stencil_openacc.hpp create mode 100644 Cxx11/transpose-openacc.cc diff --git a/.gitignore b/.gitignore index df8aeaa8f..2dab1847b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ Cxx11/nstream-cublas Cxx11/nstream-cuda Cxx11/nstream-cuda-managed Cxx11/nstream-dpcpp +Cxx11/nstream-onedpl Cxx11/nstream-executors Cxx11/nstream-hip Cxx11/nstream-hipblas @@ -154,6 +155,7 @@ Cxx11/nstream-multigpu-dpcpp Cxx11/nstream-onemkl Cxx11/nstream-opencl Cxx11/nstream-openmp +Cxx11/nstream-openacc Cxx11/nstream-openmp-target Cxx11/nstream-pstl Cxx11/nstream-raja @@ -174,6 +176,7 @@ Cxx11/nstream-vector-raja Cxx11/p2p Cxx11/p2p-doacross-openmp Cxx11/p2p-hyperplane-openmp +Cxx11/p2p-hyperplane-openacc Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-sycl @@ -212,6 +215,7 @@ Cxx11/stencil-kokkos Cxx11/stencil-mpi Cxx11/stencil-opencl Cxx11/stencil-openmp +Cxx11/stencil-openacc Cxx11/stencil-openmp-target Cxx11/stencil-pstl Cxx11/stencil-raja @@ -243,6 +247,7 @@ Cxx11/transpose-kokkos Cxx11/transpose-mpi Cxx11/transpose-opencl Cxx11/transpose-openmp +Cxx11/transpose-openacc Cxx11/transpose-openmp-target Cxx11/transpose-pstl Cxx11/transpose-raja diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 051342f45..94985da56 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -53,7 +53,6 @@ /// by the execution time. For a vector length of N, the total /// number of words read and written is 4*N*sizeof(double). /// -/// /// HISTORY: This code is loosely based on the Stream benchmark by John /// McCalpin, but does not follow all the Stream rules. Hence, /// reported results should not be associated with Stream in diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c index edc7e994b..6f79c40f3 100644 --- a/C1z/stencil-openacc.c +++ b/C1z/stencil-openacc.c @@ -160,10 +160,7 @@ int main(int argc, char * argv[]) double stencil_time = 0.0; - // interior of grid with respect to stencil - size_t active_points = (n-2*radius)*(n-2*radius); size_t bytes = n*n*sizeof(double); - double * restrict in = acc_malloc(bytes); double * restrict out = acc_malloc(bytes); @@ -196,6 +193,8 @@ int main(int argc, char * argv[]) // Analyze and output results. 
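
One step in the analysis that follows deserves a sentence of justification: with the PRK star weights (+/-1/(2*d*r) at distance d along each axis, an assumption of this note) applied to the ramp in(i,j) = i+j, every interior point gains exactly 2 per application, and the +1 added to `in` after each sweep contributes nothing because the weights sum to zero, so the reference value the validation compares against works out to 2*(iterations+1) per active point. A tiny check of that arithmetic:

    // Confirms numerically that a radius-r star with weights +/- 1/(2*d*r) applied to
    // the ramp in(i,j) = i + j produces exactly 2 at every interior point (assumed
    // PRK weight convention; d is the offset along an axis).
    #include <cstdio>

    int main(void)
    {
      for (int r = 1; r <= 9; ++r) {
        double response = 0.0;
        for (int d = 1; d <= r; ++d) {
          const double w = 1.0 / (2.0 * d * r);
          response += w * (2.0 * d);   // x-axis pair: in(i+d,j) - in(i-d,j) = 2*d
          response += w * (2.0 * d);   // y-axis pair contributes the same
        }
        printf("radius %d: response = %.6f\n", r, response);   // 2.000000 for every radius
      }
      return 0;
    }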
////////////////////////////////////////////////////////////////////// + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); // compute L1 norm in parallel double norm = 0.0; #pragma acc parallel loop reduction( +:norm ) deviceptr(out) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 00095484e..67cf61894 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -67,6 +67,15 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write('}\n\n') + elif (model=='openacc'): + src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n') + src.write(' PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) )\n') + src.write(' for (int i='+str(radius)+'; i <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenACC STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time{0}; + + size_t bytes = length*sizeof(double); + double * RESTRICT A = (double *) acc_malloc(bytes); + double * RESTRICT B = (double *) acc_malloc(bytes); + double * RESTRICT C = (double *) acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << std::setprecision(16) + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 
1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/stencil-openacc.cc b/Cxx11/stencil-openacc.cc new file mode 100644 index 000000000..18a1e212f --- /dev/null +++ b/Cxx11/stencil-openacc.cc @@ -0,0 +1,233 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" +#include "stencil_openacc.hpp" + +void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) +{ + // use arguments to silence compiler warnings + out[0] = in[0] + n + t; +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > prk::get_max_matrix_size()) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time{0}; + + size_t bytes = n*n*sizeof(double); + double * RESTRICT in = (double *)acc_malloc(bytes); + double * RESTRICT out = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i(i+j); + out[i*n+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + stencil(n, tile_size, in, out); + + #pragma acc parallel loop collapse(2) deviceptr(in) + for (int i=0; i(n-2*radius)*static_cast(n-2*radius); + // compute L1 norm in parallel + double norm = 0.0; + #pragma acc parallel loop reduction( +:norm ) deviceptr(out) + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil_openacc.hpp b/Cxx11/stencil_openacc.hpp new file mode 100644 index 000000000..523cda771 --- /dev/null +++ b/Cxx11/stencil_openacc.hpp @@ -0,0 +1,397 @@ +#define RESTRICT __restrict__ + +void star1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { + PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) ) + for (int i=1; i <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + try { + if (argc < 3) { + throw "Usage: <# iterations> [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : order; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double trans_time{0}; + + size_t bytes = order*order*sizeof(double); + double * restrict A = (double *)acc_malloc(bytes); + double * restrict B = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i(i*order+j); + B[i*order+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) + for (int i=0;i(ij)*(1.+iterations)+addit; + abserr += prk::abs(B[ji] - reference); + } + } + + acc_free(A); + acc_free(B); + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + From 5af2232d7a831564832c81951d96904d3a93c159 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:36:49 +0300 Subject: [PATCH 41/80] cleanup --- C1z/nstream-openacc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 94985da56..ba4e587af 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) ar *= length; double asum = 0.0; - #pragma acc parallel loop reduction( +:asum ) deviceptr(A,B,C) + #pragma acc parallel loop reduction( +:asum ) deviceptr(A) for (size_t i=0; i Date: Wed, 18 May 2022 03:45:59 -0700 Subject: [PATCH 42/80] fix validation --- Cxx11/dgemm-multigpu-cublas.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cxx11/dgemm-multigpu-cublas.cu 
b/Cxx11/dgemm-multigpu-cublas.cu index 160a9d12c..439f33a9b 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - info.print(); + //info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -306,18 +306,18 @@ int main(int argc, char * argv[]) double residuum(0); for (int i=0; i Date: Wed, 18 May 2022 03:47:03 -0700 Subject: [PATCH 43/80] print --- Cxx11/dgemm-multigpu-cublas.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu index 439f33a9b..18a039425 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - //info.print(); + info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters From b002f2829a2149b7989011e04339954b510485f3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 03:47:32 -0700 Subject: [PATCH 44/80] update --- common/make.defs.nvhpc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 38438888b..fa4b59e8b 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,6 +1,6 @@ # # This file shows the NVHPC toolchain options. -NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/22.2 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ @@ -74,7 +74,7 @@ CBLASFLAG=${BLASFLAG} NVCC=${NVHPC_CBIN}nvcc CUDAFLAGS=-g -O3 -std=c++17 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_75 +CUDAFLAGS+=--gpu-architecture=sm_80 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' @@ -106,8 +106,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED # MPI-3 # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
-MPIDIR=${NVHPC_PATH}/comm_libs/openmpi/openmpi-3.1.5 -#MPIDIR=${NVHPC_PATH}/comm_libs/openmpi4/openmpi-4.0.5 +MPIDIR=${NVHPC_PATH}/comm_libs/hpcx/latest/ompi MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From 06f16806267f74daa87d74b2d7e07b4372218e43 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 17:50:36 +0300 Subject: [PATCH 45/80] transpose cleanup --- FORTRAN/nstream-mpi.F90 | 3 +- FORTRAN/transpose-openacc.F90 | 119 ++++++-------------------- FORTRAN/transpose-openmp-target.F90 | 69 +++------------ FORTRAN/transpose-openmp.F90 | 63 +++----------- FORTRAN/transpose-pointer.F90 | 52 ++--------- FORTRAN/transpose-pretty.F90 | 51 +++-------- FORTRAN/transpose-stdpar.F90 | 71 +++------------ FORTRAN/transpose-taskloop-openmp.F90 | 65 +++----------- FORTRAN/transpose-tasks-openmp.F90 | 66 +++----------- FORTRAN/transpose.F90 | 10 +-- 10 files changed, 118 insertions(+), 451 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 2f4e58937..66ba8d30c 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -139,7 +139,8 @@ program main !$omp parallel default(none) & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) + !$omp& private(i,k,t0,t1) & + !$omp& shared(MPI_COMM_WORLD) #endif #if defined(_OPENMP) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 02ab0ab9d..1a0a69fe9 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -80,120 +78,50 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + write(*,'(a22,i8)') 'Tile size = ', tile_size ! ******************************************************************** ! 
** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo + !$acc data create(A,B) + + !$acc parallel loop collapse(2) + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0 enddo - endif + enddo - !$acc data pcopyin(A) pcopy(B) do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() - ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo + !$acc parallel loop tile(tile_size,tile_size) + do j=1,order + do i=1,order + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo - endif + enddo enddo ! iterations t1 = prk_get_wtime() - !$acc end data - trans_time = t1 - t0 ! ******************************************************************** @@ -212,8 +140,9 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + !$acc end data + + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 4aa431b18..a8b75a245 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -81,66 +80,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran OpenMP TARGET Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.gt.order).or.(tile_size.lt.1)) then - tile_size = order - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if (tile_size.lt.order) then - if (mod(order,tile_size).ne.0) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if (tile_size.gt.32) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -223,8 +179,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp.F90 b/FORTRAN/transpose-openmp.F90 index 93dab50a8..d88d470ff 100644 --- a/FORTRAN/transpose-openmp.F90 +++ b/FORTRAN/transpose-openmp.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -80,63 +79,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran OpenMP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - t0 = 0 - !$omp parallel default(none) & !$omp& shared(A,B,t0,t1) & !$omp& firstprivate(order,iterations,tile_size) & @@ -172,6 +135,8 @@ program main !$omp end do endif + t0 = 0 + ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. !$omp barrier diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 index 87c3eaac1..b576d5e36 100644 --- a/FORTRAN/transpose-pointer.F90 +++ b/FORTRAN/transpose-pointer.F90 @@ -57,10 +57,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -83,38 +80,14 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Serial Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** @@ -130,10 +103,6 @@ program main A(1:order,1:order) => TA B(1:order,1:order) => TB - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 if (tile_size.lt.order) then @@ -158,9 +127,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -204,8 +171,7 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index 885c4ac3d..6eff0820d 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -53,13 +53,11 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix + integer(kind=INT32) :: tile_size real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices @@ -77,57 +75,37 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Pretty Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 
1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 ! Fill the original matrix o2 = int(order,INT64)**2 A = reshape((/ (j2, j2 = 0,o2) /),(/order, order/)) B = 0 - t0 = 0 - do k=0,iterations - ! start timer after a warmup iteration + if (k.eq.1) t0 = prk_get_wtime() + B = B + transpose(A) A = A + 1 enddo ! iterations @@ -155,8 +133,7 @@ program main abserr = norm2(A-B) #endif - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 26c0e87f5..7faf89646 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -81,76 +79,33 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran stdpar Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 16 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 
1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if ((tile_size.gt.0).and.(mod(order,tile_size).ne.0)) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if ((tile_size.ne.order) .and. (tile_size.gt.32)) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 do concurrent (j=1:order, i=1:order) A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo - t0 = 0 - do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() @@ -180,7 +135,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! ******************************************************************** @@ -196,8 +150,7 @@ program main abserr = abserr + abs(B(i,j) - (temp+addit)) enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-taskloop-openmp.F90 b/FORTRAN/transpose-taskloop-openmp.F90 index 3cc0fbc78..fccef232f 100644 --- a/FORTRAN/transpose-taskloop-openmp.F90 +++ b/FORTRAN/transpose-taskloop-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,26 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a50)') 'Fortran OpenMP TASKLOOP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a,i1)') 'argument count = ', command_argument_count() - write(*,'(a)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +124,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() !$omp taskloop firstprivate(order,tile_size) shared(A,B) private(i,j,it,jt) do jt=1,order,tile_size @@ -211,8 +173,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-tasks-openmp.F90 b/FORTRAN/transpose-tasks-openmp.F90 index a0ac9afb9..7cce694ba 100644 --- a/FORTRAN/transpose-tasks-openmp.F90 +++ b/FORTRAN/transpose-tasks-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a46)') 'Fortran OpenMP TASKS Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +125,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() do jt=1,order,tile_size !$omp task firstprivate(order,tile_size,jt) shared(A,B) private(i,j,it) @@ -211,8 +174,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 4e398a1bf..56fb6ab26 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -98,6 +98,8 @@ program main stop 1 endif + t0 = 0 + if (tile_size.lt.order) then do jt=1,order,tile_size do it=1,order,tile_size @@ -118,12 +120,9 @@ program main enddo endif - t0 = 0 - do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -149,7 +148,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! 
******************************************************************** From 359da8eabe2fd842ef9d1611d63a73bfe085c902 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 18:05:13 +0300 Subject: [PATCH 46/80] fix arg parse in MPI --- FORTRAN/transpose-a2a-mpi.F90 | 35 ++++------------------ FORTRAN/transpose-acc-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-ga.F90 | 55 +++++++++++------------------------ FORTRAN/transpose-get-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-p2p-mpi.F90 | 34 ++++------------------ 5 files changed, 37 insertions(+), 155 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..c38158397 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,8 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi - !integer(kind=INT32) :: it, jt, tile_size + integer(kind=INT32) :: i, j, k, r, lo, hi, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +115,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..2b49bb0bd 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-ga.F90 b/FORTRAN/transpose-ga.F90 index 8d81c038d..5e2fde45f 100644 --- a/FORTRAN/transpose-ga.F90 +++ b/FORTRAN/transpose-ga.F90 @@ -55,14 +55,12 @@ program main use, intrinsic :: iso_fortran_env use mpi_f08 + use prk implicit none #include "global.fh" #include "mafdecls.fh" !#include 'ga-mpi.fh' ! unused - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! MPI - should always use 32-bit INTEGER integer(kind=INT32), parameter :: requested = MPI_THREAD_SERIALIZED integer(kind=INT32) :: provided @@ -86,33 +84,7 @@ program main real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 - ! ******************************************************************** - ! read and test input parameters - ! ******************************************************************** - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - call mpi_init_thread(requested,provided) + call MPI_Init_thread(requested,provided) !call ga_initialize() ! 
ask GA to allocate enough memory for 4 matrices, just to be safe @@ -124,6 +96,21 @@ program main !if (me.eq.0) print*,'max_mem=',max_mem + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' + write(*,'(a22,i8)') 'Number of GA procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + #if PRK_CHECK_GA_MPI ! We do use MPI anywhere, but if we did, we would need to avoid MPI collectives ! on the world communicator, because it is possible for that to be larger than @@ -140,14 +127,6 @@ program main endif #endif - if (me.eq.0) then - write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' - write(*,'(a22,i12)') 'Number of GA procs = ', np - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - endif - call ga_sync() ! ******************************************************************** diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ebab0c406 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..b18c3b64f 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,7 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, lo, hi, q + integer(kind=INT32) :: i, j, k, lo, hi, q, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +116,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) From a73e3cec55ffe4203002e639df8fabc8454fc09e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:06:18 +0300 Subject: [PATCH 47/80] print fix --- FORTRAN/transpose-a2a-mpi.F90 | 2 +- FORTRAN/transpose-acc-mpi.F90 | 2 +- FORTRAN/transpose-get-mpi.F90 | 2 +- FORTRAN/transpose-p2p-mpi.F90 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c38158397..72a55c797 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -117,7 +117,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 2b49bb0bd..6ac96b7cf 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index ebab0c406..96a5470d5 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index b18c3b64f..b7fc14605 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -118,7 +118,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = 
', iterations write(*,'(a22,i8)') 'Matrix order = ', order From 908e3a304f5e70d346f7f0c1c6100245ded5439c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:12:56 +0300 Subject: [PATCH 48/80] args --- FORTRAN/transpose-coarray.F90 | 57 +++++------------------------------ 1 file changed, 8 insertions(+), 49 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index bc15f1238..08526a1bb 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -58,10 +58,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp integer :: me, np logical :: printer ! problem definition @@ -90,37 +87,18 @@ program main ! ******************************************************************** if (printer) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(6,'(a25)') 'Parallel Research Kernels' write(6,'(a41)') 'Fortran coarray Matrix transpose: B = A^T' + write(6,'(a23,i8)') 'Number of images = ', np + write(6,'(a23,i8)') 'Number of iterations = ', iterations + write(6,'(a23,i8)') 'Matrix order = ', order + write(6,'(a23,i8)') 'Tile size = ', tile_size endif + call co_broadcast(iterations,1) + call co_broadcast(order,1) + call co_broadcast(tile_size,1) - if (command_argument_count().lt.2) then - if (printer) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(6,'(a62)') 'Usage: ./transpose <# iterations> []' - endif - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - if (printer) then - write(6,'(a35,i5)') 'ERROR: iterations must be >= 1 : ', iterations - endif - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - if (printer) then - write(6,'(a30,i5)') 'ERROR: order must be >= 1 : ', order - endif - stop 1 - endif if (modulo(order,np).gt.0) then if (printer) then write(6,'(a20,i5,a35,i5)') 'ERROR: matrix order ',order,& @@ -130,18 +108,6 @@ program main endif block_order = order/np - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(6,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** @@ -152,13 +118,6 @@ program main stop 1 endif - if (printer) then - write(6,'(a23,i8)') 'Number of images = ', np - write(6,'(a23,i8)') 'Number of iterations = ', iterations - write(6,'(a23,i8)') 'Matrix order = ', order - write(6,'(a23,i8)') 'Tile size = ', tile_size - endif - ! initialization ! 
local column index j corresponds to global column index block_order*me+j if ((tile_size.gt.1).and.(tile_size.lt.order)) then From f465bed8fa7f3d5ef319f0102f9b24f17582e688 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Thu, 2 Jun 2022 09:47:05 -0600 Subject: [PATCH 49/80] Fix typos in README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 14a059365..4fbb49dd8 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ executed on many computing systems. These programs should not be used as benchmarks. They are operations to explore features of a hardware platform, but they do not define fixed problems that can be used to rank systems. Furthermore -they have not been optimimzed for the features of any particular system. +they have not been optimized for the features of any particular system. # Build Instructions @@ -51,7 +51,7 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | -Some of the C++ implementations require you to install Boost, RAJA, KOKKOS, Parallel STL, respectively, +Some of the C++ implementations require you to install Boost, RAJA, Kokkos, Parallel STL, respectively, and then modify `make.defs` appropriately. Please see the documentation in the [documentation](https://github.com/ParRes/Kernels/tree/default/doc) (`doc`) subdirectory. @@ -215,7 +215,7 @@ be used unless a `make veryclean` has been issued. ## Individual make -Descend into the desired sub-tree and cd to the kernel(s) of interest. +Descend into the desired sub-tree and `cd` to the kernel(s) of interest. Each kernel has its own Makefile. There are a number of parameters that determine the behavior of the kernel that need to be known at compile time. These are explained succinctly in the Makefile itself. 
Edit From 00e68f8ddb482149255fc3bf77d878e8b2ff1bc5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:11:01 +0300 Subject: [PATCH 50/80] mpifort required for prk_mpi_mod --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4faa6796a..6d3b0c1f1 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -99,7 +99,7 @@ prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 - $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + $(MPIFORT) $(FCFLAGS) -c $< -o prk_mpi_mod.o stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 From c89329dd810f315f96c3d6ba15a13d6e922e7ee8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:13:36 +0300 Subject: [PATCH 51/80] default(none) and MPI_COMM_WORLD cannot coexist --- FORTRAN/nstream-mpi.F90 | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 66ba8d30c..aa6c6b408 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -136,11 +136,10 @@ program main scalar = 3 #ifdef _OPENMP - !$omp parallel default(none) & + !$omp parallel & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) & - !$omp& shared(MPI_COMM_WORLD) + !$omp& private(i,k,t0,t1) #endif #if defined(_OPENMP) From 83bad194530f9f3778cff68c9fcfd68d15e64146 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:14:08 +0300 Subject: [PATCH 52/80] update for homebrew --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 05a06c0ee..afcf1a6ae 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -205,7 +205,7 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.1_2 +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From eeabb8c15c97797ec1840e9b8adc15c74e53ea9a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:38:22 +0300 Subject: [PATCH 53/80] replace non-constant tiling with automatic tiling non-constant tiling was supported by NVHPC not GCC, and was not standard anyways. --- FORTRAN/stencil-openacc.F90 | 98 +++++++++-------------------------- FORTRAN/transpose-openacc.F90 | 7 ++- 2 files changed, 28 insertions(+), 77 deletions(-) diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index a5543e5f3..da660dd22 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -61,82 +61,42 @@ ! ! 
******************************************************************* -subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) +subroutine apply_stencil(is_star,r,n,W,A,B) use, intrinsic :: iso_fortran_env implicit none - logical, intent(in) :: is_star, tiling - integer(kind=INT32), intent(in) :: tile_size, r, n + logical, intent(in) :: is_star + integer(kind=INT32), intent(in) :: r, n real(kind=REAL64), intent(in) :: W(-r:r,-r:r) real(kind=REAL64), intent(in) :: A(n,n) real(kind=REAL64), intent(inout) :: B(n,n) - integer(kind=INT32) :: i, j, ii, jj, it, jt + integer(kind=INT32) :: i, j, ii, jj !$acc data pcopyin(W,A) pcopy(B) if (is_star) then - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) enddo - enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo - enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo - enddo - endif ! tiling - else ! grid - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo - enddo + enddo + else ! grid + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + do ii=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) enddo enddo enddo - endif ! tiling + enddo endif ! star !$acc end data end subroutine apply_stencil @@ -150,8 +110,6 @@ program main integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension integer(kind=INT32) :: stencil_size ! number of points in stencil - integer(kind=INT32) :: tile_size ! loop nest block factor - logical :: tiling ! boolean indication loop nest blocking logical :: is_star ! true = star, false = grid integer(kind=INT32), parameter :: r=RADIUS ! radius of stencil real(kind=REAL64) :: W(-r:r,-r:r) ! weights of points in the stencil @@ -172,7 +130,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenACC Stencil execution on 2D grid' - call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) + call prk_get_arguments('stencil',iterations=iterations,order=n) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -181,8 +139,6 @@ program main is_star = .false. 
#endif - tiling = (tile_size.ne.n) - write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Grid size = ', n write(*,'(a22,i8)') 'Radius of stencil = ', r @@ -193,11 +149,7 @@ program main write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - if (tiling) then - write(*,'(a22,i8)') 'Tile size = ', tile_size - else - write(*,'(a10)') 'Tiling off' - endif + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and perform the computation @@ -228,7 +180,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() ! Apply the stencil operator - call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) + call apply_stencil(is_star,r,n,W,A,B) ! add constant to solution to force refresh of neighbor data, if any !$acc parallel loop collapse(2) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 1a0a69fe9..ad242cbfb 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -66,7 +66,6 @@ program main integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -78,11 +77,11 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + call prk_get_arguments('transpose',iterations=iterations,order=order) write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order - write(*,'(a22,i8)') 'Tile size = ', tile_size + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -110,7 +109,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() - !$acc parallel loop tile(tile_size,tile_size) + !$acc parallel loop tile(*,*) do j=1,order do i=1,order B(j,i) = B(j,i) + A(i,j) From dd8308e00648a3b402ccdd613cfd2ee5bbae5c09 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:06:52 +0300 Subject: [PATCH 54/80] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4fbb49dd8..994639a7e 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,16 @@ If you are looking for the simplest option, try `make.defs.gcc`. | File (in `./common/`) | Environment | |----------------------|-------------------------| -| `make.defs.cray` | Cray compilers on Cray XC systems. | +| `make.defs.cray` | Cray toolchain (rarely tested). | | `make.defs.cuda` | GCC with the CUDA compiler (only used in C++/CUDA implementation). | -| `make.defs.gcc` | GCC compiler tool chain, which supports essentially all implementations. | +| `make.defs.gcc` | GCC compiler toolchain, which supports essentially all implementations (tested often). | | `make.defs.freebsd` | FreeBSD (rarely tested). | | `make.defs.ibmbg` | IBM Blue Gene/Q compiler toolchain (deprecated). | -| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms. | -| `make.defs.intel` | Intel compiler tool chain, which supports most implementations. 
| -| `make.defs.llvm` | LLVM compiler tool chain, which supports most implementations. | -| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which is required to use C11 threads. | +| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms (rarely tested). | +| `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | +| `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | +| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | +| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | | `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From 029c003c20a5ffaf522708685e673200f7c5de9e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:31:00 +0300 Subject: [PATCH 55/80] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 994639a7e..4c0852d4d 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,8 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | | `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | | `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | -| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | -| `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | +| `make.defs.nvhpc` | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-downloads), which supports most implementations (tested often). | +| `make.defs.oneapi` | Intel [oneAPI](https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From b1699023fb8d491c8a17d485dc57814b2e799005 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Oct 2022 13:45:38 +0300 Subject: [PATCH 56/80] update OpenCL C++ header this ancient one does not compile with ICPX or Clang when C++20 is enabled --- Cxx11/{cl2.hpp => opencl.hpp} | 706 +++++++++++++++++++++------------- Cxx11/prk_opencl.h | 2 +- 2 files changed, 429 insertions(+), 279 deletions(-) rename Cxx11/{cl2.hpp => opencl.hpp} (94%) diff --git a/Cxx11/cl2.hpp b/Cxx11/opencl.hpp similarity index 94% rename from Cxx11/cl2.hpp rename to Cxx11/opencl.hpp index 09e295ec5..1e61d7890 100644 --- a/Cxx11/cl2.hpp +++ b/Cxx11/opencl.hpp @@ -1,36 +1,23 @@ -/******************************************************************************* - * Copyright (c) 2008-2016 The Khronos Group Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS - * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS - * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT - * https://www.khronos.org/registry/ - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - ******************************************************************************/ +// +// Copyright (c) 2008-2020 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), - * OpenCL 1.2 (rev 15), OpenCL 2.0 (rev 29), OpenCL 2.1 (rev 17), - * and OpenCL 2.2 (V2.2-11). + * \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2, + * OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0. * \author Lee Howes and Bruce Merry * * Derived from the OpenCL 1.x C++ bindings written by @@ -73,10 +60,10 @@ * For many large applications C++ is the language of choice and so it seems * reasonable to define C++ bindings for OpenCL. * - * The interface is contained with a single C++ header file \em cl2.hpp and all + * The interface is contained with a single C++ header file \em opencl.hpp and all * definitions are contained within the namespace \em cl. There is no additional * requirement to include \em cl.h and to use either the C++ or original C - * bindings; it is enough to simply include \em cl2.hpp. + * bindings; it is enough to simply include \em opencl.hpp. * * The bindings themselves are lightweight and correspond closely to the * underlying C API. Using the C++ bindings introduces no additional execution @@ -85,7 +72,7 @@ * There are numerous compatibility, portability and memory management * fixes in the new header as well as additional OpenCL 2.0 features. * As a result the header is not directly backward compatible and for this - * reason we release it as cl2.hpp rather than a new version of cl.hpp. 
+ * reason we release it as opencl.hpp rather than a new version of cl.hpp. * * * \section compatibility Compatibility @@ -157,30 +144,26 @@ * - CL_HPP_NO_STD_STRING * * Do not use the standard library string class. cl::string is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_VECTOR * * Do not use the standard library vector class. cl::vector is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_ARRAY * * Do not use the standard library array class. cl::array is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_UNIQUE_PTR * * Do not use the standard library unique_ptr class. cl::pointer and * the cl::allocate_pointer functions are not defined and may be - * defined by the user before cl2.hpp is included. - * - * - CL_HPP_ENABLE_DEVICE_FISSION - * - * Enables device fission for OpenCL 1.2 platforms. + * defined by the user before opencl.hpp is included. * * - CL_HPP_ENABLE_EXCEPTIONS * @@ -207,10 +190,22 @@ * applies to use of cl::Program construction and other program * build variants. * + * - CL_HPP_USE_CL_DEVICE_FISSION + * + * Enable the cl_ext_device_fission extension. + * + * - CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR + * + * Enable the cl_khr_image2d_from_buffer extension. + * * - CL_HPP_USE_CL_SUB_GROUPS_KHR * * Enable the cl_khr_subgroups extension. * + * - CL_HPP_USE_DX_INTEROP + * + * Enable the cl_khr_d3d10_sharing extension. + * * - CL_HPP_USE_IL_KHR * * Enable the cl_khr_il_program extension. @@ -222,12 +217,16 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. + * + * Note: the C++ bindings use std::call_once and therefore may need to be + * compiled using special command-line options (such as "-pthread") on some + * platforms! * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - #include + #include #include #include #include @@ -237,28 +236,30 @@ int main(void) { - // Filter for a 2.0 platform and set it as the default + // Filter for a 2.0 or newer platform and set it as the default std::vector platforms; cl::Platform::get(&platforms); cl::Platform plat; for (auto &p : platforms) { std::string platver = p.getInfo(); - if (platver.find("OpenCL 2.") != std::string::npos) { + if (platver.find("OpenCL 2.") != std::string::npos || + platver.find("OpenCL 3.") != std::string::npos) { + // Note: an OpenCL 3.x platform may not support all required features! 
plat = p; } } - if (plat() == 0) { - std::cout << "No OpenCL 2.0 platform found."; + if (plat() == 0) { + std::cout << "No OpenCL 2.0 or newer platform found.\n"; return -1; } cl::Platform newP = cl::Platform::setDefault(plat); if (newP != plat) { - std::cout << "Error setting default platform."; + std::cout << "Error setting default platform.\n"; return -1; } - // Use C++11 raw string literals for kernel source code + // C++11 raw string literal for the first kernel std::string kernel1{R"CLC( global int globalA; kernel void updateGlobal() @@ -266,6 +267,8 @@ globalA = 75; } )CLC"}; + + // Raw string literal for the second kernel std::string kernel2{R"CLC( typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, @@ -292,8 +295,9 @@ } )CLC"}; - // New simpler string interface style - std::vector programStrings {kernel1, kernel2}; + std::vector programStrings; + programStrings.push_back(kernel1); + programStrings.push_back(kernel2); cl::Program vectorAddProgram(programStrings); try { @@ -332,10 +336,9 @@ std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); - // ////////////// - // Traditional cl_mem allocations + std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); @@ -359,14 +362,8 @@ // This one was not passed as a parameter vectorAddKernel.setSVMPointers(anSVMInt); - // Hand control of coarse allocations to runtime - cl::enqueueUnmapSVM(anSVMInt); - cl::enqueueUnmapSVM(fooPointer); - cl::unmapSVM(inputB); - cl::unmapSVM(output2); - - cl_int error; - vectorAddKernel( + cl_int error; + vectorAddKernel( cl::EnqueueArgs( cl::NDRange(numElements/2), cl::NDRange(numElements/2)), @@ -377,12 +374,10 @@ 3, aPipe, defaultDeviceQueue, - error + error ); cl::copy(outputBuffer, begin(output), end(output)); - // Grab the SVM output vector using a map - cl::mapSVM(output2); cl::Device d = cl::Device::getDefault(); @@ -406,59 +401,60 @@ * both and hence work with either version of the bindings. */ #if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP) -# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") +# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") # define CL_HPP_USE_DX_INTEROP #endif #if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) -# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") +# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") # define CL_HPP_USE_CL_DEVICE_FISSION #endif #if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) -# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") +# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") # define CL_HPP_ENABLE_EXCEPTIONS #endif #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) -# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") +# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") # define CL_HPP_NO_STD_VECTOR #endif #if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING) -# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. 
Define CL_HPP_NO_STD_STRING instead") +# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead") # define CL_HPP_NO_STD_STRING #endif #if defined(VECTOR_CLASS) -# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") +# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") #endif #if defined(STRING_CLASS) -# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") +# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") #endif #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") +# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") # define CL_HPP_USER_OVERRIDE_ERROR_STRINGS #endif /* Warn about features that are no longer supported */ #if defined(__USE_DEV_VECTOR) -# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") #endif #if defined(__USE_DEV_STRING) -# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") #endif /* Detect which version to target */ #if !defined(CL_HPP_TARGET_OPENCL_VERSION) -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 220 (OpenCL 2.2)") -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)") +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif #if CL_HPP_TARGET_OPENCL_VERSION != 100 && \ CL_HPP_TARGET_OPENCL_VERSION != 110 && \ CL_HPP_TARGET_OPENCL_VERSION != 120 && \ CL_HPP_TARGET_OPENCL_VERSION != 200 && \ CL_HPP_TARGET_OPENCL_VERSION != 210 && \ - CL_HPP_TARGET_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 220") + CL_HPP_TARGET_OPENCL_VERSION != 220 && \ + CL_HPP_TARGET_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).") # undef CL_HPP_TARGET_OPENCL_VERSION -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif /* Forward target OpenCL version to C headers if necessary */ @@ -480,8 +476,9 @@ CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 100") + CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). 
It will be set to 100") # undef CL_HPP_MINIMUM_OPENCL_VERSION # define CL_HPP_MINIMUM_OPENCL_VERSION 100 #endif @@ -541,13 +538,15 @@ #include #endif // !__APPLE__ -#if (__cplusplus >= 201103L) +#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L ) #define CL_HPP_NOEXCEPT_ noexcept #else #define CL_HPP_NOEXCEPT_ #endif -#if defined(_MSC_VER) +#if __cplusplus >= 201703L +# define CL_HPP_DEFINE_STATIC_MEMBER_ inline +#elif defined(_MSC_VER) # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) #elif defined(__MINGW32__) # define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany)) @@ -557,19 +556,26 @@ // Define deprecated prefixes and suffixes to ensure compilation // in case they are not pre-defined -#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) - -#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) #if !defined(CL_CALLBACK) #define CL_CALLBACK @@ -1326,13 +1332,20 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ \ + F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \ + \ + F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \ + F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector) \ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector) \ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \ + F(cl_device_info, 
CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \ \ F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \ F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \ @@ -1352,6 +1365,14 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \ + F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \ + F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \ F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \ F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \ F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \ @@ -1367,17 +1388,17 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector) #define CL_HPP_PARAM_NAME_INFO_2_1_(F) \ - F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, size_type) \ + F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \ F(cl_program_info, CL_PROGRAM_IL, cl::vector) \ - F(cl_kernel_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ - F(cl_kernel_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) \ F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \ F(cl_device_info, CL_DEVICE_IL_VERSION, string) \ F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \ F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \ F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) + F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \ + F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) #define CL_HPP_PARAM_NAME_INFO_2_2_(F) \ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ @@ -1390,6 +1411,43 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector) +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \ + F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector) + +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr) + +#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \ + 
F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector) \ + F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ + F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector) \ + F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \ + F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \ + \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector) \ + F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector) \ + F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector) \ + F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) + template struct param_traits {}; @@ -1418,12 +1476,15 @@ CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_) #if CL_HPP_TARGET_OPENCL_VERSION >= 220 CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 +CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 -#if defined(CL_HPP_USE_IL_KHR) +#if defined(CL_HPP_USE_IL_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_IL_KHR) @@ -1454,6 +1515,35 @@ CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_); #endif // CL_HPP_USE_CL_DEVICE_FISSION +#if defined(cl_khr_extended_versioning) +#if CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_khr_extended_versioning + +#if defined(cl_khr_device_uuid) +using uuid_array = array; +using luid_array = array; +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint) +#endif + +#if defined(cl_khr_pci_bus_info) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr) +#endif + +#if defined(cl_khr_integer_dot_product) 
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr) +#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +#endif // defined(cl_khr_integer_dot_product) + #ifdef CL_PLATFORM_ICD_SUFFIX_KHR CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string) #endif @@ -1461,7 +1551,6 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, strin #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) #endif - #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector) #endif @@ -1492,6 +1581,9 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) #endif +#ifdef CL_DEVICE_BOARD_NAME_AMD +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string) +#endif #ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) @@ -1499,6 +1591,30 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_AR #ifdef CL_DEVICE_JOB_SLOTS_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) #endif +#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield) +#endif +#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector) +#endif +#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint) +#endif #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) @@ -1862,6 +1978,7 @@ class Wrapper retVal = true; #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + (void)device; return retVal; } @@ -1982,51 +2099,7 @@ inline bool 
operator!=(const Wrapper &lhs, const Wrapper &rhs) //! \endcond -using BuildLogType = vector::param_type>>; -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -/** -* Exception class for build errors to carry build info -*/ -class BuildError : public Error -{ -private: - BuildLogType buildLogs; -public: - BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) - { - } - BuildLogType getBuildLog() const - { - return buildLogs; - } -}; -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - if (err != CL_SUCCESS) { - throw BuildError(err, errStr, buildLogs); - } - return err; - } -} // namespace detail - -#else -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - (void)buildLogs; // suppress unused variable warning - (void)errStr; - return err; - } -} // namespace detail -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) /*! \stuct ImageFormat @@ -2046,6 +2119,9 @@ struct ImageFormat : public cl_image_format image_channel_data_type = type; } + //! \brief Copy constructor. + ImageFormat(const ImageFormat &other) { *this = other; } + //! \brief Assignment operator. ImageFormat& operator = (const ImageFormat& rhs) { @@ -2187,7 +2263,7 @@ class Device : public detail::Wrapper } //! \brief Wrapper for clGetDeviceInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2299,7 +2375,7 @@ class Device : public detail::Wrapper const cl_device_partition_property_ext * /* properties */, cl_uint /*num_entries*/, cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1; static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); @@ -2333,6 +2409,52 @@ class Device : public detail::Wrapper #endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) }; +using BuildLogType = vector::param_type>>; +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +/** +* Exception class for build errors to carry build info +*/ +class BuildError : public Error +{ +private: + BuildLogType buildLogs; +public: + BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) + { + } + + BuildLogType getBuildLog() const + { + return buildLogs; + } +}; +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + if (err != CL_SUCCESS) { + throw BuildError(err, errStr, buildLogs); + } + return err; + } +} // namespace detail + +#else +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + (void)buildLogs; // suppress unused variable warning + (void)errStr; + return err; + } +} // namespace detail +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) + CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS; @@ -2465,7 +2587,8 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo(). 
- cl_int getInfo(cl_platform_info name, string* param) const + template + cl_int getInfo(cl_platform_info name, T* param) const { return detail::errHandler( detail::getInfo(&::clGetPlatformInfo, object_, name, param), @@ -2473,7 +2596,7 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2708,8 +2831,8 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS; * Unload the OpenCL compiler. * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. */ -inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int -UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED; inline cl_int UnloadCompiler() { @@ -2799,7 +2922,7 @@ class Context */ Context( const vector& devices, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2828,9 +2951,13 @@ class Context } } + /*! \brief Constructs a context including a specific device. + * + * Wraps clCreateContext(). + */ Context( const Device& device, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2860,7 +2987,7 @@ class Context */ Context( cl_device_type type, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -3030,7 +3157,7 @@ class Context } //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3172,7 +3299,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3195,7 +3322,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getProfilingInfo(cl_int* err = NULL) const { @@ -3226,7 +3353,7 @@ class Event : public detail::Wrapper */ cl_int setCallback( cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), void * user_data = NULL) { return detail::errHandler( @@ -3387,7 +3514,7 @@ class Memory : public detail::Wrapper } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3415,7 +3542,7 @@ class Memory : public detail::Wrapper * value - not the Memory class instance. 
*/ cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), void * user_data = NULL) { return detail::errHandler( @@ -3758,7 +3885,7 @@ cl::pointer> allocate_pointer(const Alloc &alloc_, Arg return cl::pointer>(tmp, detail::Deleter{alloc, copies}); } - catch (std::bad_alloc& b) + catch (std::bad_alloc&) { std::allocator_traits::deallocate(alloc, tmp, copies); throw; @@ -3893,7 +4020,7 @@ class Buffer : public Memory Context context = Context::getDefault(err); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -4006,7 +4133,7 @@ class Buffer : public Memory } return result; - } + } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 }; @@ -4385,7 +4512,7 @@ class Image : public Memory } //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getImageInfo(cl_int* err = NULL) const { @@ -4422,12 +4549,11 @@ class Image1D : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D, - width, - 0, 0, 0, 0, 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + object_ = ::clCreateImage( context(), flags, @@ -4510,13 +4636,12 @@ class Image1DBuffer : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - width, - 0, 0, 0, 0, 0, 0, 0, - buffer() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.buffer = buffer(); + object_ = ::clCreateImage( context(), flags, @@ -4596,15 +4721,13 @@ class Image1DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_ARRAY, - width, - 0, 0, // height, depth (unused) - arraySize, - rowPitch, - 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_width = width; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + object_ = ::clCreateImage( context(), flags, @@ -4711,15 +4834,12 @@ class Image2D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + object_ = ::clCreateImage( context(), flags, @@ -4765,17 +4885,13 @@ class Image2D : public Image { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, - // Use buffer as input to image - sourceBuffer() - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.buffer = sourceBuffer(); + object_ = ::clCreateImage( context(), 0, // flags inherited from buffer @@ -4829,19 +4945,16 @@ class Image2D : public Image // Update only the channel order. // Channel format inherited from source. 
sourceFormat.image_channel_order = order; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - sourceWidth, - sourceHeight, - 0, 0, // depth (unused), array size (unused) - sourceRowPitch, - 0, // slice pitch (unused) - sourceNumMIPLevels, - sourceNumSamples, - // Use buffer as input to image - sourceImage() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = sourceWidth; + desc.image_height = sourceHeight; + desc.image_row_pitch = sourceRowPitch; + desc.num_mip_levels = sourceNumMIPLevels; + desc.num_samples = sourceNumSamples; + desc.buffer = sourceImage(); + object_ = ::clCreateImage( context(), 0, // flags should be inherited from mem_object @@ -4921,7 +5034,7 @@ class Image2D : public Image * \see Memory * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. */ -class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D +class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D { public: /*! \brief Constructs an Image2DGL in a specified context, from a given @@ -5004,7 +5117,7 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D return *this; } -} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +} CL_API_SUFFIX__VERSION_1_1_DEPRECATED; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -5027,17 +5140,15 @@ class Image2DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D_ARRAY, - width, - height, - 0, // depth (unused) - arraySize, - rowPitch, - slicePitch, - 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_width = width; + desc.image_height = height; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + object_ = ::clCreateImage( context(), flags, @@ -5142,17 +5253,14 @@ class Image3D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE3D, - width, - height, - depth, - 0, // array size (unused) - row_pitch, - slice_pitch, - 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + object_ = ::clCreateImage( context(), flags, @@ -5534,7 +5642,7 @@ class Pipe : public Memory } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5667,7 +5775,7 @@ class Sampler : public detail::Wrapper } //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
- template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5890,7 +5998,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5912,7 +6020,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_ARG_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getArgInfo(cl_uint argIndex, cl_int* err = NULL) const { @@ -5936,7 +6044,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_WORK_GROUP_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getWorkGroupInfo(const Device& device, cl_int* err = NULL) const { @@ -5971,7 +6079,7 @@ class Kernel : public detail::Wrapper #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 } - template + template size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const { size_type param; @@ -6134,6 +6242,23 @@ class Kernel : public detail::Wrapper sizeof(void*)*(1 + sizeof...(Ts)), pointerList.data())); } + + template + cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val) + { + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + param_name, + sizeof(T), + &val)); + } + + template + cl_int setExecInfo(typename detail::param_traits::param_type& val) + { + return setExecInfo(name, val); + } #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6339,8 +6464,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6393,8 +6517,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6538,7 +6661,7 @@ class Program : public detail::Wrapper Program() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_program - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
* Defaults to false to maintain compatibility with @@ -6606,6 +6729,27 @@ class Program : public detail::Wrapper return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } + cl_int build( + const Device& device, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + cl_device_id deviceID = device(); + + cl_int buildError = ::clBuildProgram( + object_, + 1, + &deviceID, + options, + notifyFptr, + data); + + BuildLogType buildLog(0); + buildLog.push_back(std::make_pair(device, getBuildInfo(device))); + return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog); + } + cl_int build( const char* options = NULL, void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, @@ -6619,7 +6763,6 @@ class Program : public detail::Wrapper notifyFptr, data); - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } @@ -6651,7 +6794,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -6674,7 +6817,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_BUILD_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getBuildInfo(const Device& device, cl_int* err = NULL) const { @@ -6692,7 +6835,7 @@ class Program : public detail::Wrapper * info type and for all devices in the program. * On an error reading the info for any device, an empty vector of info will be returned. */ - template + template vector::param_type>> getBuildInfo(cl_int *err = NULL) const { @@ -6762,6 +6905,7 @@ class Program : public detail::Wrapper } #if CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Registers a callback function to be called when destructors for * program scope global variables are complete and before the * program is released. @@ -6772,9 +6916,9 @@ class Program : public detail::Wrapper * on a callback stack associated with program. The registered user callback * functions are called in the reverse order in which they were registered. */ - cl_int setReleaseCallback( + CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback( void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), - void * user_data = NULL) + void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED { return detail::errHandler( ::clSetProgramReleaseCallback( @@ -6783,6 +6927,7 @@ class Program : public detail::Wrapper user_data), __SET_PROGRAM_RELEASE_CALLBACK_ERR); } +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Sets a SPIR-V specialization constant. * @@ -6978,6 +7123,11 @@ inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) return static_cast(static_cast(lhs) | static_cast(rhs)); } +inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs) +{ + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + /*! \class CommandQueue * \brief CommandQueue interface for cl_command_queue. */ @@ -7434,7 +7584,7 @@ class CommandQueue : public detail::Wrapper CommandQueue() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_command_queue - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
* Defaults to false to maintain compatibility with @@ -7486,7 +7636,7 @@ class CommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -8119,7 +8269,7 @@ class CommandQueue : public detail::Wrapper { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(container.data()), container.size(), + object_, blocking, flags, static_cast(container.data()), container.size()*sizeof(T), (events != NULL) ? (cl_uint)events->size() : 0, (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, (event != NULL) ? &tmp : NULL), @@ -8478,10 +8628,10 @@ class CommandQueue : public detail::Wrapper } #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) - CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( + CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8538,8 +8688,8 @@ class CommandQueue : public detail::Wrapper * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8554,8 +8704,8 @@ class CommandQueue : public detail::Wrapper return err; } - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const vector& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const vector& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueWaitForEvents( @@ -8691,8 +8841,8 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueBarrier(object_), @@ -8866,7 +9016,7 @@ class DeviceCommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -9038,7 +9188,7 @@ Buffer::Buffer( size_type size = sizeof(DataType)*(endIterator - startIterator); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -9091,7 +9241,7 @@ Buffer::Buffer( Context context = queue.getInfo(); if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); @@ -9213,7 +9363,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::pointer ptr, + 
cl::pointer &ptr, cl_bool blocking, cl_map_flags flags, size_type size, @@ -9237,7 +9387,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::vector container, + cl::vector &container, cl_bool blocking, cl_map_flags flags, const vector* events = NULL, @@ -10063,7 +10213,7 @@ class KernelFunctor namespace compatibility { /** - * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp. + * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp. * Please use KernelFunctor directly. */ template diff --git a/Cxx11/prk_opencl.h b/Cxx11/prk_opencl.h index f8f0ade9c..b8d783438 100644 --- a/Cxx11/prk_opencl.h +++ b/Cxx11/prk_opencl.h @@ -19,7 +19,7 @@ #include -#include "cl2.hpp" +#include "opencl.hpp" namespace prk { From dee4ba9d96b9c7b0eeba6c28dc79d23776f31530 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sat, 5 Nov 2022 14:36:36 -0500 Subject: [PATCH 57/80] RUST: nstream with rayon! --- .gitignore | 2 + RUST/Makefile | 1 + RUST/nstream-rayon/Cargo.toml | 9 ++ RUST/nstream-rayon/src/main.rs | 184 +++++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 RUST/nstream-rayon/Cargo.toml create mode 100644 RUST/nstream-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index a92a237e3..bd9ee8deb 100644 --- a/.gitignore +++ b/.gitignore @@ -377,6 +377,8 @@ RUST/nstream-unsafe/Cargo.lock RUST/nstream-unsafe/target/ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ +RUST/nstream-rayon/Cargo.lock +RUST/nstream-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index d70e5855e..9904e005b 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -16,6 +16,7 @@ all: cd nstream && cargo build $(RCFLAGS) cd nstream-unsafe && cargo build $(RCFLAGS) cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml new file mode 100644 index 000000000..054caa930 --- /dev/null +++ b/RUST/nstream-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "nstream" +version = "0.1.0" +authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/nstream-rayon/src/main.rs b/RUST/nstream-rayon/src/main.rs new file mode 100644 index 000000000..4d02cb145 --- /dev/null +++ b/RUST/nstream-rayon/src/main.rs @@ -0,0 +1,184 @@ +// +// Copyright (c) 2020, Intel Corporation +// Copyright (c) 2020, Thomas Hayward-Schneider +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: nstream +// +// PURPOSE: To compute memory bandwidth when adding a vector of a given +// number of double precision values to the scalar multiple of +// another vector of the same length, and storing the result in +// a third vector. +// +// USAGE: The program takes as input the number +// of iterations to loop over the triad vectors, the length of the +// vectors, and the offset between vectors +// +// <# iterations> +// +// The output consists of diagnostics to make sure the +// algorithm worked, and of timing statistics. +// +// NOTES: Bandwidth is determined as the number of words read, plus the +// number of words written, times the size of the words, divided +// by the execution time. For a vector length of N, the total +// number of words read and written is 4*N*sizeof(double). +// +// HISTORY: This code is loosely based on the Stream benchmark by John +// McCalpin, but does not follow all the Stream rules. Hence, +// reported results should not be associated with Stream in +// external publications +// +// Converted to C++11 by Jeff Hammond, November 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +//use std::num; // abs? 
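+// rayon's prelude supplies the parallel-iterator traits (IntoParallelIterator,
+// ParallelIterator) used below: the three vectors are zipped with into_par_iter()
+// so the triad update of `a` runs across threads.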
+use rayon::prelude::*; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust STREAM triad: A = B + scalar * C"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let length: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + length = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("vector length = {}", length); + + /////////////////////////////////////////////// + // Allocate space and perform the computation + /////////////////////////////////////////////// + + let mut a: Vec = vec![0.0; length]; + let b: Vec = vec![2.0; length]; + let c: Vec = vec![2.0; length]; + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + let scalar: f64 = 3.0; + + for _k in 0..iterations + 1 { + if _k == 1 { + t0 = timer.elapsed(); + } + + (&mut a, &b, &c).into_par_iter().for_each(|(x, y, z)| { + *x += *y + scalar * (*z); + }); + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let nstream_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let mut ar: f64 = 0.0; + let br: f64 = 2.0; + let cr: f64 = 2.0; + for _k in 0..iterations + 1 { + ar += br + scalar * cr; + } + + ar *= length as f64; + + let mut asum = 0.0; + for i in 0..length { + let absa: f64 = a[i].abs(); + asum += absa; + } + + let err: f64 = (ar - asum) / asum; + let abserr: f64 = err.abs(); + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (nstream_time as f64) / (iterations as f64); + let nbytes: usize = 4 * length * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nbytes as f64) / avgtime, + avgtime + ); + } else { + println!("Failed Validation on output array"); + println!(" Expected checksum: {}", ar); + println!(" Observed checksum: {}", asum); + println!("ERROR: solution did not validate"); + } + return; +} From be2972f727c5b322b4cb4f3e4f79f2f4f9329002 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sun, 6 Nov 2022 21:32:55 -0600 Subject: [PATCH 58/80] RUST: dgemm with iter and rayon! 
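
The iterator version (dgemm-iter, added below) expresses C += A x B as nested
iterator chains: rows of C zipped with rows of A, then each element of a row of
A zipped with the matching row of B. The rayon version is expected to differ
mainly in parallelizing the outermost row loop. A rough sketch of that idea,
using rayon's par_chunks_exact_mut / par_chunks_exact slice iterators (whether
the patch uses exactly these calls is not shown here; the function name and
signature are illustrative only):

    use rayon::prelude::*;

    // Accumulate C += A x B for order x order row-major matrices,
    // distributing whole rows of C across threads.
    fn dgemm_rows(order: usize, a: &[f64], b: &[f64], c: &mut [f64]) {
        c.par_chunks_exact_mut(order)       // ci: mutable row i of C
            .zip(a.par_chunks_exact(order)) // ai: row i of A
            .for_each(|(ci, ai)| {
                for (aik, bk) in ai.iter().zip(b.chunks_exact(order)) {
                    for (cij, bkj) in ci.iter_mut().zip(bk.iter()) {
                        *cij += aik * bkj;
                    }
                }
            });
    }

Keeping the parallelism on the outermost chunks means each row of C is written
by exactly one thread, so no synchronization is needed on the output.
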
modified: .gitignore modified: RUST/Makefile new file: RUST/dgemm-iter/Cargo.toml new file: RUST/dgemm-iter/src/main.rs new file: RUST/dgemm-rayon/Cargo.toml new file: RUST/dgemm-rayon/src/main.rs modified: RUST/dgemm/Cargo.toml modified: RUST/dgemm/src/main.rs modified: RUST/transpose/Cargo.toml modified: RUST/transpose/src/main.rs --- .gitignore | 6 + RUST/Makefile | 37 +++-- RUST/dgemm-iter/Cargo.toml | 6 + RUST/dgemm-iter/src/main.rs | 202 ++++++++++++++++++++++++ RUST/dgemm-rayon/Cargo.toml | 9 ++ RUST/dgemm-rayon/src/main.rs | 204 ++++++++++++++++++++++++ RUST/dgemm/Cargo.toml | 7 +- RUST/dgemm/src/main.rs | 225 +++++++++++++-------------- RUST/transpose/Cargo.toml | 4 +- RUST/transpose/src/main.rs | 291 +++++++++++++++++++++-------------- 10 files changed, 734 insertions(+), 257 deletions(-) create mode 100644 RUST/dgemm-iter/Cargo.toml create mode 100644 RUST/dgemm-iter/src/main.rs create mode 100644 RUST/dgemm-rayon/Cargo.toml create mode 100644 RUST/dgemm-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index bd9ee8deb..9ba4c2b06 100644 --- a/.gitignore +++ b/.gitignore @@ -379,6 +379,12 @@ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ +RUST/dgemm/Cargo.lock +RUST/dgemm/target/ +RUST/dgemm-iter/Cargo.lock +RUST/dgemm-iter/target/ +RUST/dgemm-rayon/Cargo.lock +RUST/dgemm-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index 9904e005b..cc3fa2d06 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -13,21 +13,24 @@ RCFLAGS += --release .PHONY: all clean all: - cd nstream && cargo build $(RCFLAGS) - cd nstream-unsafe && cargo build $(RCFLAGS) - cd nstream-iter && cargo build $(RCFLAGS) - cd nstream-rayon && cargo build $(RCFLAGS) - cd p2p && cargo build $(RCFLAGS) - cd stencil && cargo build $(RCFLAGS) - cd transpose && cargo build $(RCFLAGS) - cd dgemm && cargo build $(RCFLAGS) - + cd nstream && cargo build $(RCFLAGS) + cd nstream-unsafe && cargo build $(RCFLAGS) + cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) + cd p2p && cargo build $(RCFLAGS) + cd stencil && cargo build $(RCFLAGS) + cd transpose && cargo build $(RCFLAGS) + cd dgemm && cargo build $(RCFLAGS) + cd dgemm-iter && cargo build $(RCFLAGS) + cd dgemm-rayon && cargo build $(RCFLAGS) clean: - cd nstream && cargo clean - cd nstream-unsafe && cargo clean - cd nstream-iter && cargo clean - cd p2p && cargo clean - cd stencil && cargo clean - cd transpose && cargo clean - cd dgemm && cargo clean - + cd nstream && cargo clean + cd nstream-unsafe && cargo clean + cd nstream-iter && cargo clean + cd nstream-rayon && cargo clean + cd p2p && cargo clean + cd stencil && cargo clean + cd transpose && cargo clean + cd dgemm && cargo clean + cd dgemm-iter && cargo clean + cd dgemm-rayon && cargo clean diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml new file mode 100644 index 000000000..5714a1fa3 --- /dev/null +++ b/RUST/dgemm-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/dgemm-iter/src/main.rs b/RUST/dgemm-iter/src/main.rs new file mode 100644 index 000000000..208cc47b5 --- /dev/null +++ b/RUST/dgemm-iter/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided 
that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
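+//          Rust iterator version by Sajid Ali, November 2022.
+//
+// The iterator formulation below keeps the i-k-j loop order: rows of C
+// are zipped with rows of A, and every element of row i of A is combined
+// with the corresponding row of B, accumulating into row i of C.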
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // https://www.reidatcheson.com/matrix%20multiplication/rust/iterators/2021/02/26/gemm-iterators.html + c.chunks_exact_mut(order) + .zip(a.chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } 
+} diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml new file mode 100644 index 000000000..49886cd96 --- /dev/null +++ b/RUST/dgemm-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/dgemm-rayon/src/main.rs b/RUST/dgemm-rayon/src/main.rs new file mode 100644 index 000000000..30dc55057 --- /dev/null +++ b/RUST/dgemm-rayon/src/main.rs @@ -0,0 +1,204 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
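+//          Rust rayon version by Sajid Ali, November 2022.
+//
+// This variant keeps the iterator-based i-k-j structure of dgemm-iter and
+// parallelizes only the outermost loop over the rows of C with rayon's
+// par_chunks_exact_mut; the inner iterators remain sequential.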
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +use rayon::prelude::*; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Outermost loop parallelism applied to dgemm-iter version + c.par_chunks_exact_mut(order) + .zip(a.par_chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff 
--git a/RUST/dgemm/Cargo.toml b/RUST/dgemm/Cargo.toml index 4548f4f12..cd045832a 100644 --- a/RUST/dgemm/Cargo.toml +++ b/RUST/dgemm/Cargo.toml @@ -1,9 +1,6 @@ [package] name = "dgemm" version = "0.1.0" -authors = ["Jeff Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] -blas = "0.20" -cblas = "0.2" -blas-src = { version = "0.7", features = ["blis"] } +edition="2021" diff --git a/RUST/dgemm/src/main.rs b/RUST/dgemm/src/main.rs index b0d03aaa4..930d3a60d 100644 --- a/RUST/dgemm/src/main.rs +++ b/RUST/dgemm/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -52,136 +53,130 @@ // /////////////////////////////////////////////// -extern crate blas; -extern crate cblas; -extern crate blas_src; - use std::env; -use std::time::{Instant,Duration}; - -//use blas::*; -use cblas::*; - -fn prk_dgemm(order : usize, a : &mut Vec, b : &mut Vec, c : &mut Vec) -{ - for i in 0..order { - for k in 0..order { - for j in 0..order { - c[i*order+j] += a[i*order+k] * b[k*order+j]; - } - } - } -} +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> "); + println!("Usage: <# iterations> "); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Dense matrix-matrix multiplication: C += A x B"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - let mut c : Vec = vec![0.0; nelems]; + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; - for i in 0..order { - for j in 0..order { - a[i*order+j] = i as 
f64; - b[i*order+j] = i as f64; + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } } - } - - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); - for k in 0..iterations+1 { - - if k == 1 { t0 = timer.elapsed(); } + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + for i in 0..order { + for k in 0..order { + for j in 0..order { + c[i * order + j] += a[i * order + k] * b[k * order + j]; + } + } + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } - //prk_dgemm(order, &mut a, &mut b, &mut c); - let m : i32 = order as i32; - let n : i32 = order as i32; - let k : i32 = order as i32; - unsafe { - dgemm(Layout::RowMajor, Transpose::None, Transpose::None, - m, n, k, 1.0, &a, m, &b, k, 1.0, &mut c, m); + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let dgemm_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let forder : f64 = order as f64; - let reference : f64 = 0.25 * (forder*forder*forder) * (forder-1.0)*(forder-1.0) * (iterations as f64 + 1.0); - let mut checksum : f64 = 0.0; - for i in 0..order { - for j in 0..order { - checksum += c[i*order+j]; + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", checksum); - } - - let epsilon : f64 = 1.0e-8; - let residuum : f64 = (checksum - reference)/reference; - if residuum < epsilon { - println!("Solution validates"); - let avgtime : f64 = (dgemm_time as f64) / (iterations as f64); - let uorder : usize = order as usize; - let nflops : usize = 2_usize * uorder * uorder * uorder; - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (nflops as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", residuum, epsilon); - return; - } } - - diff --git a/RUST/transpose/Cargo.toml b/RUST/transpose/Cargo.toml index 3f634d3c5..22fe9074e 100644 --- a/RUST/transpose/Cargo.toml +++ b/RUST/transpose/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "transpose" version = "0.1.0" -authors = ["Jeff 
Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] +edition = "2021" diff --git a/RUST/transpose/src/main.rs b/RUST/transpose/src/main.rs index 935addae8..baace9c90 100644 --- a/RUST/transpose/src/main.rs +++ b/RUST/transpose/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -54,137 +55,191 @@ use std::env; use std::mem; -use std::time::{Instant,Duration}; +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> [tile size]"); + println!("Usage: <# iterations> [tile size]"); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Matrix transpose: B = A^T"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - let tilesize : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = 32; - }, - 4 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = match args[3].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } - if tilesize > order { - println!("ERROR: tilesize cannot be > order"); - } - - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); - if tilesize < order { - println!("Tile size = {}", tilesize); - } else { - println!("Untiled"); - } - - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// - - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - - for i in 0..order { - for j in 0..order { - a[i*order+j] = (i*order+j) as f64; + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); } - } - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize 
< order { + println!("Tile size = {}", tilesize); + } else { + println!("Untiled"); + } - for k in 0..iterations+1 { + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// - if k == 1 { t0 = timer.elapsed(); } + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + // Initialize matrices for i in 0..order { - for j in 0..order { - b[j*order+i] += a[i*order+j]; - a[i*order+j] += 1.0; - } + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let transpose_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2; - let mut abserr : f64 = 0.0; - for i in 0..order { - for j in 0..order { - let ij = i*order+j; - let ji = j*order+i; - let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64; - abserr += (b[ji] - reference).abs(); + let (num_tiles, boundscheck): (usize, bool) = if order % tilesize == 0 { + (order / tilesize, false) // all tiles have same size + } else { + (order / tilesize + 1, true) // last tile has size < tilesize + }; + + println!("Initialization done, running algorithm"); + if boundscheck { + println!("Warning: Matrix order not divisible by tilesize, will employ bounds checking!") } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", abserr); - } - - let epsilon : f64 = 1.0e-8; - if abserr < epsilon { - println!("Solution validates"); - let avgtime : f64 = (transpose_time as f64) / (iterations as f64); - let bytes : usize = 2_usize * nelems * mem::size_of::(); - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (bytes as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", abserr, epsilon); - return; - } -} + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Version with no bounds check + if !boundscheck { + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } else { + // Version with bounds check + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + if rowidx < order && colidx < order { + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } + } + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + 
for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From e5009d17778e35c547326fa689ff144948bb0fcc Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Mon, 7 Nov 2022 07:35:05 -0600 Subject: [PATCH 59/80] RUST: transpose with iter! modified: .gitignore new file: RUST/transpose-iter/Cargo.toml new file: RUST/transpose-iter/src/main.rs --- .gitignore | 2 + RUST/transpose-iter/Cargo.toml | 6 + RUST/transpose-iter/src/main.rs | 258 ++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+) create mode 100644 RUST/transpose-iter/Cargo.toml create mode 100644 RUST/transpose-iter/src/main.rs diff --git a/.gitignore b/.gitignore index 9ba4c2b06..eef173b49 100644 --- a/.gitignore +++ b/.gitignore @@ -391,6 +391,8 @@ RUST/stencil/Cargo.lock RUST/stencil/target/ RUST/transpose/Cargo.lock RUST/transpose/target/ +RUST/transpose-iter/Cargo.lock +RUST/transpose-iter/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml new file mode 100644 index 000000000..22fe9074e --- /dev/null +++ b/RUST/transpose-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs new file mode 100644 index 000000000..f50e7dd27 --- /dev/null +++ b/RUST/transpose-iter/src/main.rs @@ -0,0 +1,258 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + /* + (0..num_tiles).for_each(|row_tile_idx| { + 
(0..num_tiles).for_each(|col_tile_idx| { + (0..tilesize).for_each(|row_within_tile| { + (0..tilesize).for_each(|col_within_tile| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + b[rowidx * order + colidx] += a[colidx * order + rowidx]; + }) + }) + }) + }); + */ + + b.chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterator over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterator over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 1c30af0a05e9738a1733f1c2cef51067495a3f6d Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 09:47:15 -0600 Subject: [PATCH 60/80] RUST: add transpose-rayon modified: .gitignore modified: RUST/Makefile modified: RUST/transpose-iter/src/main.rs new file: RUST/transpose-rayon/Cargo.toml new file: RUST/transpose-rayon/src/main.rs --- .gitignore | 2 + RUST/Makefile | 4 + RUST/transpose-iter/src/main.rs | 19 +-- RUST/transpose-rayon/Cargo.toml | 9 ++ RUST/transpose-rayon/src/main.rs | 247 +++++++++++++++++++++++++++++++ 5 files changed, 265 insertions(+), 16 deletions(-) create mode 100644 RUST/transpose-rayon/Cargo.toml create mode 100644 RUST/transpose-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index eef173b49..1bacbfade 100644 --- a/.gitignore +++ b/.gitignore @@ -393,6 +393,8 @@ RUST/transpose/Cargo.lock RUST/transpose/target/ 
RUST/transpose-iter/Cargo.lock RUST/transpose-iter/target/ +RUST/transpose-rayon/Cargo.lock +RUST/transpose-rayon/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/Makefile b/RUST/Makefile index cc3fa2d06..f72474c64 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -20,6 +20,8 @@ all: cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) + cd transpose-iter && cargo build $(RCFLAGS) + cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) @@ -31,6 +33,8 @@ clean: cd p2p && cargo clean cd stencil && cargo clean cd transpose && cargo clean + cd transpose-iter && cargo clean + cd transpose-rayon && cargo clean cd dgemm && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs index f50e7dd27..4aae33be1 100644 --- a/RUST/transpose-iter/src/main.rs +++ b/RUST/transpose-iter/src/main.rs @@ -166,35 +166,22 @@ fn main() { t0 = timer.elapsed(); } - /* - (0..num_tiles).for_each(|row_tile_idx| { - (0..num_tiles).for_each(|col_tile_idx| { - (0..tilesize).for_each(|row_within_tile| { - (0..tilesize).for_each(|col_within_tile| { - let rowidx: usize = row_tile_idx * tilesize + row_within_tile; - let colidx: usize = col_tile_idx * tilesize + col_within_tile; - b[rowidx * order + colidx] += a[colidx * order + rowidx]; - }) - }) - }) - }); - */ - b.chunks_exact_mut(tilesize * order) .enumerate() // for the current set of row tiles // and the rows corresponding to this row tile .for_each(|(row_tile_idx, b_rows)| { - // iterator over all column tiles + // iterate over all column tiles (0..num_tiles).for_each(|col_tile_idx| { // within the tile, iterate over *tilesize* rows of b // zipped together with rows of b available in the tile (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b |(row_within_tile, bi)| { let bi_subset_cols = bi .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) .unwrap(); - // within the tile, iterator over *tilesize* columns of b + // within the tile, iterate over *tilesize* columns of b // zipped together with subset of columns of b (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( |(col_within_tile, b_element)| { diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml new file mode 100644 index 000000000..fa75e1f79 --- /dev/null +++ b/RUST/transpose-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/transpose-rayon/src/main.rs b/RUST/transpose-rayon/src/main.rs new file mode 100644 index 000000000..8cfced9c0 --- /dev/null +++ b/RUST/transpose-rayon/src/main.rs @@ -0,0 +1,247 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use rayon::prelude::*; +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let 
nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // parallelisze outermost loop with rayon + b.par_chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterate over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterate over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.par_iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 3036da41ec0e783595c0b54d72d62b72321a8163 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 17:07:50 -0600 Subject: [PATCH 61/80] Update nstream-kokkos for kokkos-3.7 compatibility modified: Cxx11/nstream-kokkos.cc --- Cxx11/nstream-kokkos.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index 6ec3528de..340ce1819 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -163,7 +163,7 @@ int main(int argc, char * argv[]) double asum(0); Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { - using Kokkos::Experimental::fabs; + using Kokkos::fabs; inner += fabs(A(i)); }, asum); 
Kokkos::fence(); From a7687472c15942630b42fca50a68f558afc79f4a Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Wed, 9 Nov 2022 10:22:31 -0600 Subject: [PATCH 62/80] RUST: clarify naming, and add old blis based dgemm as a separate kernel modified: .gitignore new file: RUST/dgemm-blis/Cargo.toml new file: RUST/dgemm-blis/src/main.rs modified: RUST/dgemm-iter/Cargo.toml modified: RUST/dgemm-rayon/Cargo.toml modified: RUST/nstream-iter/Cargo.toml modified: RUST/nstream-rayon/Cargo.toml modified: RUST/transpose-iter/Cargo.toml modified: RUST/transpose-rayon/Cargo.toml --- .gitignore | 2 + RUST/dgemm-blis/Cargo.toml | 10 ++ RUST/dgemm-blis/src/main.rs | 202 ++++++++++++++++++++++++++++++++ RUST/dgemm-iter/Cargo.toml | 2 +- RUST/dgemm-rayon/Cargo.toml | 2 +- RUST/nstream-iter/Cargo.toml | 2 +- RUST/nstream-rayon/Cargo.toml | 2 +- RUST/transpose-iter/Cargo.toml | 2 +- RUST/transpose-rayon/Cargo.toml | 2 +- 9 files changed, 220 insertions(+), 6 deletions(-) create mode 100644 RUST/dgemm-blis/Cargo.toml create mode 100644 RUST/dgemm-blis/src/main.rs diff --git a/.gitignore b/.gitignore index 1bacbfade..73e16e2da 100644 --- a/.gitignore +++ b/.gitignore @@ -381,6 +381,8 @@ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ RUST/dgemm/Cargo.lock RUST/dgemm/target/ +RUST/dgemm-blis/Cargo.lock +RUST/dgemm-blis/target/ RUST/dgemm-iter/Cargo.lock RUST/dgemm-iter/target/ RUST/dgemm-rayon/Cargo.lock diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml new file mode 100644 index 000000000..3ea994400 --- /dev/null +++ b/RUST/dgemm-blis/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "dgemm-blis" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition="2021" + +[dependencies] +cblas = "0.4" +blas-src = { version = "0.8", features = ["blis"] } diff --git a/RUST/dgemm-blis/src/main.rs b/RUST/dgemm-blis/src/main.rs new file mode 100644 index 000000000..83ff6d041 --- /dev/null +++ b/RUST/dgemm-blis/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +// Need the following to prevent linker errors per +// https://github.com/blas-lapack-rs/blas-lapack-rs.github.io/wiki +extern crate blas_src; + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + //prk_dgemm(order, &mut a, &mut b, &mut c); + let m: i32 = order as i32; + let n: i32 = order as i32; + let k: i32 = order as i32; + unsafe { + cblas::dgemm( + cblas::Layout::RowMajor, + cblas::Transpose::None, + cblas::Transpose::None, + m, + n, + k, + 1.0, + &a, + m, + &b, + k, + 1.0, + &mut c, + m, + ); + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + 
/////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml index 5714a1fa3..af296857c 100644 --- a/RUST/dgemm-iter/Cargo.toml +++ b/RUST/dgemm-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml index 49886cd96..905e888df 100644 --- a/RUST/dgemm-rayon/Cargo.toml +++ b/RUST/dgemm-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/nstream-iter/Cargo.toml b/RUST/nstream-iter/Cargo.toml index 479e87e60..b43f54b10 100644 --- a/RUST/nstream-iter/Cargo.toml +++ b/RUST/nstream-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml index 054caa930..af291bdbf 100644 --- a/RUST/nstream-rayon/Cargo.toml +++ b/RUST/nstream-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml index 22fe9074e..840edb129 100644 --- a/RUST/transpose-iter/Cargo.toml +++ b/RUST/transpose-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml index fa75e1f79..540969f59 100644 --- a/RUST/transpose-rayon/Cargo.toml +++ b/RUST/transpose-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] From 929548b9fb5508e56f5e63588f5b2845089b808a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 17 Nov 2022 13:24:42 +0200 Subject: [PATCH 63/80] GCC OpenACC does not support runtime tilesizes --- C1z/transpose-openacc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 0ffc76c8e..8bd66d14d 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -90,7 +90,11 @@ int main(int argc, char * argv[]) printf("Number of iterations = %d\n", iterations); printf("Matrix order = %d\n", order); +#ifdef __GNUC__ + printf("Tile size = %s\n", "automatic (GCC)"); +#else printf("Tile size = %d\n", tile_size); +#endif 
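+  /* With GCC the tile size is chosen by the compiler: GCC's OpenACC
+     implementation does not support runtime tile sizes in the tile clause,
+     so the transpose loop below uses tile(*,*) and the requested tile_size
+     is only parsed, not used. */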
////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix @@ -115,7 +119,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk_wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:25:57 +0200 Subject: [PATCH 64/80] GCC OpenACC does not support runtime tilesizes --- Cxx11/transpose-openacc.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index 130d424d3..c93e0414c 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -96,7 +96,11 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; +#ifdef __GNUC__ + std::cout << "Tile size = " << "automatic (GCC)" << std::endl; +#else std::cout << "Tile size = " << tile_size << std::endl; +#endif ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -121,7 +125,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk::wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:47:34 +0200 Subject: [PATCH 65/80] fix restrict->RESTRICT --- Cxx11/transpose-openacc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index c93e0414c..258064534 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -109,8 +109,8 @@ int main(int argc, char * argv[]) double trans_time{0}; size_t bytes = order*order*sizeof(double); - double * restrict A = (double *)acc_malloc(bytes); - double * restrict B = (double *)acc_malloc(bytes); + double * RESTRICT A = (double *)acc_malloc(bytes); + double * RESTRICT B = (double *)acc_malloc(bytes); { #pragma acc parallel loop deviceptr(A,B) From 793667cbd2eb137c73b1675894cd7ad1f21ce2f5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 7 Nov 2022 08:44:06 +0200 Subject: [PATCH 66/80] add SGEMM CBLAS --- Cxx11/Makefile | 2 +- Cxx11/sgemm-cblas.cc | 340 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 Cxx11/sgemm-cblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3a50f690a..ee69f1a75 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -133,7 +133,7 @@ thrust: nstream-host-thrust nstream-device-thrust \ cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas -cblas: transpose-cblas dgemm-cblas +cblas: transpose-cblas dgemm-cblas sgemm-cblas onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl diff --git a/Cxx11/sgemm-cblas.cc b/Cxx11/sgemm-cblas.cc new file mode 100644 index 000000000..625ce693f --- /dev/null +++ b/Cxx11/sgemm-cblas.cc @@ -0,0 +1,340 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: sgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblas_sgemm() +/// cblas_sgemm_batch() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#if defined(MKL) +#include +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. 
+#include +#else +#include +#endif + +#ifdef _OPENMP +#include +#endif + +#ifdef PRK_DEBUG +#include +void prk_sgemm_loops(const int order, + const std::vector & A, + const std::vector & B, + std::vector & C) +{ + for (int i=0; i & A, + const std::vector & B, + std::vector & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n); +} + +void prk_sgemm(const int order, const int batches, + const std::vector> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + for (int b=0; b> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic) num_threads(nt) +#endif + for (int b=0; b [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + batch_threads = std::atoi(argv[4]); + } else { +#ifdef _OPENMP + batch_threads = omp_get_max_threads(); +#endif + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches > 0) { +#ifdef MKL + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; +#else + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; +#endif + } else if (batches < 0) { + if (batch_threads > 1) { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl; + } else { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double gemm_time(0); + + const int matrices = (batches==0 ? 1 : abs(batches)); + + std::vector const M(order*order,0); + std::vector> A(matrices,M); + std::vector> B(matrices,M); + std::vector> C(matrices,M); + for (int b=0; b 0) { + prk_sgemm(order, matrices, pA, pB, pC); + } + } + gemm_time = prk::wtime() - gemm_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + for (int b=0; b Date: Mon, 23 Jan 2023 14:58:18 +0200 Subject: [PATCH 67/80] add shmem4py (#618) * add shmem4py example * better install directions * shmem alltoall behaves different than mpi alltoall, so we have to add a barrier. 
* abort does not flush print Signed-off-by: Jeff Hammond Co-authored-by: Lisandro Dalcin --- PYTHON/README.md | 22 ++++ PYTHON/nstream-numpy-shmem.py | 175 ++++++++++++++++++++++++ PYTHON/stencil-numpy-mpi.py | 0 PYTHON/transpose-numpy-mpi-rma.py | 11 +- PYTHON/transpose-numpy-mpi.py | 5 +- PYTHON/transpose-numpy-shmem.py | 212 ++++++++++++++++++++++++++++++ 6 files changed, 415 insertions(+), 10 deletions(-) create mode 100755 PYTHON/nstream-numpy-shmem.py mode change 100644 => 100755 PYTHON/stencil-numpy-mpi.py create mode 100755 PYTHON/transpose-numpy-shmem.py diff --git a/PYTHON/README.md b/PYTHON/README.md index 7f670436f..9c624b775 100644 --- a/PYTHON/README.md +++ b/PYTHON/README.md @@ -1,5 +1,7 @@ # How to run +## mpi4py + ``` mpiexec -n 4 python3 -m mpi4py nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 python3 -m mpi4py transpose-numpy-mpi.py 10 1000 @@ -11,3 +13,23 @@ On Mac with Homebrew, this might work better: mpiexec -n 4 ./nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 ./transpose-numpy-mpi.py 10 1000 ``` + +## shmem4py + +Checkout shmem4py and build against e.g. SOS like this: +``` +$ export OSHCC=oshcc +$ python3 -m pip install . +``` + +Run like this: +``` +$ oshrun -n 4 python3 nstream-numpy-shmem.py 10 10000000 +Parallel Research Kernels version +Python SHMEM/NumPy STREAM triad: A = B + scalar * C +Number of ranks = 4 +Number of iterations = 10 +Vector length = 10000000 +Solution validates +Rate (MB/s): 22345.12038433607 Avg time (s): 0.0143208 +``` diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py new file mode 100755 index 000000000..3b42f0488 --- /dev/null +++ b/PYTHON/nstream-numpy-shmem.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# +# ******************************************************************* + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: python nstream.py <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + total_length = int(sys.argv[2]) + if total_length < 1: + sys.exit("ERROR: length must be positive") + + length = int(total_length / np) + remainder = total_length % np + if (remainder > 0): + if (me < remainder): + length += 1 + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Vector length = ', total_length) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # 0.0 is a float, which is 64b (53b of precision) + A = numpy.zeros(length) + B = numpy.full(length,2.0) + C = numpy.full(length,2.0) + + scalar = 3.0 + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + A += B + scalar * C + + + shmem.barrier_all() + t1 = timer() + nstream_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + ar = 0.0 + br = 2.0 + cr = 2.0 + ref = 0.0 + for k in range(0,iterations+1): + ar += br + scalar * cr + + ar *= total_length + + #asum = numpy.linalg.norm(A, ord=1) + #shmem.reduce(asum) + + asum = numpy.linalg.norm(A, ord=1) + src = shmem.full(1, asum) + tgt = shmem.full(1, 0.0) + shmem.reduce(tgt,src) + asum = tgt + + epsilon=1.e-8 + if abs(ar-asum)/asum > epsilon: + if (me==0): + print('Failed Validation on output array'); + print(' Expected checksum: ',ar); + print(' Observed checksum: ',asum); + print("ERROR: solution did not validate") + else: + if (me==0): + print('Solution validates') + avgtime = nstream_time/iterations + nbytes = 4.0 * total_length * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + + +if __name__ == '__main__': + main() diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py index efa3ca359..b064596ee 100755 --- a/PYTHON/transpose-numpy-mpi-rma.py +++ b/PYTHON/transpose-numpy-mpi-rma.py @@ -159,8 +159,10 @@ def main(): for phase in range(0,np): recv_from = (me + phase) % np bsize = block_order * block_order - WA.Get(T, recv_from, [bsize * recv_from, bsize, MPI.DOUBLE]) - WA.Flush_all() + #WA.Get(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + #WA.Flush(recv_from) + r = WA.Rget(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + r.Wait() lo = block_order * recv_from hi = block_order * (recv_from+1) @@ -200,10 +202,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index 5dacbd5ea..d0413f52f 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -190,10 +190,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py new file mode 100755 index 000000000..1495dec53 --- /dev/null +++ b/PYTHON/transpose-numpy-shmem.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. +# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. 
+# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + if order % np != 0: + sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + A = shmem.full((order,block_order),LA) + B = shmem.zeros((order,block_order)) + T = shmem.zeros((order,block_order)) + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + # this actually forms the transpose of A + #B += numpy.transpose(A) + # this only uses the transpose _view_ of A + #B += A.T + + # barrier required before alltoall for correctness + shmem.barrier_all() + shmem.alltoall(T, A) + for r in range(0,np): + lo = block_order * r + hi = block_order * (r+1) + #B[lo:hi,:] += numpy.transpose(T[lo:hi,:]) + B[lo:hi,:] += T[lo:hi,:].T + + A += 1.0 + + shmem.barrier_all() + t1 = timer() + trans_time = t1 - t0 + + shmem.free(A) + shmem.free(T) + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = shmem.zeros((np,order,block_order)) + shmem.fcollect(F,B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + shmem.free(B) + shmem.free(F) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. 
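+    # note (added comment): abserr above is the L1 norm of G - H, i.e. the gathered transpose versus the analytic reference built with fromfunction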
+ if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From e39cd7ced7e3118352b88ab18c834111e95b3004 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Jan 2023 14:05:49 +0200 Subject: [PATCH 68/80] rename (#619) --- FORTRAN/Makefile | 2 +- FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} | 0 FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} (100%) rename FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} (100%) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 6d3b0c1f1..625490385 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -141,7 +141,7 @@ dgemm-blas: dgemm-blas.F90 prk.mod %-openacc: %-openacc.F90 prk.mod $(FC) $(FCFLAGS) $(OPENACCFLAG) $< prk_mod.o -o $@ -%-cufortran: %-cufortran.cuf prk.mod +%-cufortran: %-cufortran.F90 prk.mod $(FC) $(FCFLAGS) $(CUFORTFLAG) $< prk_mod.o -o $@ %-stdpar: %-stdpar.F90 prk.mod diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.F90 similarity index 100% rename from FORTRAN/nstream-cufortran.cuf rename to FORTRAN/nstream-cufortran.F90 diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.F90 similarity index 100% rename from FORTRAN/transpose-cufortran.cuf rename to FORTRAN/transpose-cufortran.F90 From a8c9d697317b21860039a72d6937968223ad81d7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:03:46 +0200 Subject: [PATCH 69/80] fix name --- RUST/nstream-unsafe/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RUST/nstream-unsafe/Cargo.toml b/RUST/nstream-unsafe/Cargo.toml index 479e87e60..81a229d01 100644 --- a/RUST/nstream-unsafe/Cargo.toml +++ b/RUST/nstream-unsafe/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-unsafe" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] From 02937f26557fd35738016ff8bc788e8da26a2848 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:27:15 +0200 Subject: [PATCH 70/80] add dgemm-blis --- RUST/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RUST/Makefile b/RUST/Makefile index f72474c64..3516f35f1 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -23,6 +23,7 @@ all: cd transpose-iter && cargo build $(RCFLAGS) cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) + cd dgemm-blis && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) clean: @@ -36,5 +37,6 @@ clean: cd transpose-iter && cargo clean cd transpose-rayon && cargo clean cd dgemm && cargo clean + cd dgemm-blis && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean From 98ad8948ad85599a836160a3aa6c789dba3618c9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 Mar 2023 09:55:48 +0200 Subject: [PATCH 71/80] dunno --- RUST/dgemm-blis/Cargo.toml | 2 +- common/make.defs.gcc | 55 +++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml index 3ea994400..249b6fb8c 100644 --- a/RUST/dgemm-blis/Cargo.toml +++ b/RUST/dgemm-blis/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 
"dgemm-blis" -version = "0.1.0" +version = "0.5.0" authors = ["Jeff Hammond ", "Sajid Ali "] edition="2021" diff --git a/common/make.defs.gcc b/common/make.defs.gcc index afcf1a6ae..62e540298 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-11 +VERSION=-10 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -43,15 +43,15 @@ OPENACCFLAG=-fopenacc # OpenCL flags # # MacOS -OPENCLFLAG=-framework OpenCL +#OPENCLFLAG=-framework OpenCL # POCL # http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux -#OPENCLDIR=/etc/alternatives/opencl-intel-tools -#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLDIR=/etc/alternatives/opencl-intel-tools +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders +#METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # @@ -98,19 +98,17 @@ METALFLAG=-framework MetalPerformanceShaders # # hipSYCL # -SYCLDIR=/opt/hipSYCL -SYCLCXX=${SYCLDIR}/bin/syclcc-clang -SYCLFLAG=-std=c++17 -O3 -SYCLFLAG+=-DHIPSYCL +#SYCLDIR=/opt/hipSYCL +#SYCLCXX=${SYCLDIR}/bin/syclcc-clang +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-DHIPSYCL # CPU platform -SYCLFLAG+=--hipsycl-platform=cpu -SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib +#SYCLFLAG+=--hipsycl-platform=cpu +#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib # -CELERITYDIR=${SYCLDIR} -CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor -CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime -MPIINC=-I/usr/include/mpich-3.2-x86_64 -MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +#CELERITYDIR=${SYCLDIR} +#CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor +#CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # # OCCA # @@ -162,19 +160,19 @@ UPCXXFLAG+=-mtune=native -ffast-math # #BLASFLAG=-L${HOME}/BLIS/lib -lblis #-fopenmp -lpthread #CBLASFLAG=-I${HOME}/BLIS/include -BLASFLAG=-DACCELERATE -framework Accelerate -CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions -#BLASFLAG=-lblas -#`CBLASFLAG=-lblas +#BLASFLAG=-DACCELERATE -framework Accelerate +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASFLAG=-lblas +CBLASFLAG=-lblas # # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -NVCC=nvcc +NVCC=/usr/local/cuda-11.4/bin/nvcc CUDAFLAGS=-g -O3 -std=c++11 -CUDAFLAGS+=-arch=sm_50 +CUDAFLAGS+=-arch=sm_87 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 #CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # @@ -205,10 +203,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 -MPICC=${MPIDIR}/bin/mpicc -MPICXX=${MPIDIR}/bin/mpicxx -MPIFORT=${MPIDIR}/bin/mpifort +MPIDIR=/usr +MPICC=${MPIDIR}/bin/mpicc.mpich +MPICXX=${MPIDIR}/bin/mpicxx.mpich +MPIFORT=${MPIDIR}/bin/mpifort.mpich MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi @@ -241,7 +239,8 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib # single-node #COARRAYFLAG=-fcoarray=single -lcaf_single # multi-node -COARRAYFLAG=-fcoarray=lib 
-L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +COARRAYFLAG=-fcoarray=lib -L/usr/lib/x86_64-linux-gnu/open-coarrays/mpich/lib -lcaf_mpi # # MEMKIND (used in C1z) # From 53c10ce02dbd3fd6d4bb2fd1028bd5c8987ff7cc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Mar 2023 15:33:58 +0300 Subject: [PATCH 72/80] gcc apple update for ventura (#623) --- common/make.defs.gcc | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 62e540298..2f52fa0c6 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-10 +VERSION=-12 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -186,16 +186,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 # -# Halide -# -HALIDECXX=${CXX} -HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -# # ISPC # ISPC=ispc @@ -203,10 +193,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/usr -MPICC=${MPIDIR}/bin/mpicc.mpich -MPICXX=${MPIDIR}/bin/mpicxx.mpich -MPIFORT=${MPIDIR}/bin/mpifort.mpich +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.5 +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi From f3a392e609078f3a23715c609ab5069e1a4fe961 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:13:26 +0300 Subject: [PATCH 73/80] brew tbb update --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 2f52fa0c6..1d6dd89e8 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -121,7 +121,7 @@ OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 +TBBDIR=/opt/homebrew/Cellar/tbb/2021.8.0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb From 7925db02a5ebe6d65a1b49348e31151b4a01668a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:23:24 +0300 Subject: [PATCH 74/80] add flang-new docs --- doc/flang-new.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/flang-new.md diff --git a/doc/flang-new.md b/doc/flang-new.md new file mode 100644 index 000000000..fe47c6e13 --- /dev/null +++ b/doc/flang-new.md @@ -0,0 +1,6 @@ +This works, but -flang-experimental-exec` and `-Wall` are ignored. 
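+The block below is one example sequence; the `/opt/llvm/latest` install prefix and the CommandLineTools SDK path used in the link step are machine-specific.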
+ +``` +/opt/llvm/latest/bin/flang-new -flang-experimental-exec -g -O3 -ffast-math -Wall -DRADIUS=2 -DSTAR -c p2p.F90 +ld -L /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib -lSystem p2p.o prk_mod.o -o p2p /opt/llvm/latest/lib/libFortran*a +``` From 252bbb5e047cd4e95e550597b7afe6c54170abf8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 19 Jul 2023 11:04:29 +0300 Subject: [PATCH 75/80] fix petsc transpose - closes #615 (#626) --- C1z/nstream-petsc.c | 2 +- C1z/transpose-petsc.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/C1z/nstream-petsc.c b/C1z/nstream-petsc.c index 93b931872..aec86ff5c 100644 --- a/C1z/nstream-petsc.c +++ b/C1z/nstream-petsc.c @@ -119,7 +119,7 @@ int main(int argc, char * argv[]) #endif PetscPrintf(PETSC_COMM_WORLD,"Number of processes = %d\n", np); PetscPrintf(PETSC_COMM_WORLD,"Number of iterations = %d\n", iterations); - PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", length); + PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", (size_t)length); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/C1z/transpose-petsc.c b/C1z/transpose-petsc.c index ed631cf88..aa3219c68 100644 --- a/C1z/transpose-petsc.c +++ b/C1z/transpose-petsc.c @@ -118,10 +118,7 @@ int main(int argc, char * argv[]) double trans_time = 0.0; - PetscReal zero = 0; PetscReal one = 1; - PetscReal two = 2; - PetscReal three = 3; Mat A; Mat B; @@ -144,6 +141,8 @@ int main(int argc, char * argv[]) } } ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + // https://petsc.org/main/manualpages/Mat/MatTransposeSetPrecursor/ + ierr = MatTransposeSetPrecursor(A, AT); CHKERRQ(ierr); // B[i,j] = 0 #if 0 @@ -196,9 +195,9 @@ int main(int argc, char * argv[]) // Analyze and output results ////////////////////////////////////////////////////////////////////// - PetscReal addit = (iterations+1)*(iterations)/2; PetscReal abserr = 0; #if 0 + PetscReal addit = (iterations+1)*(iterations)/2; for (int j=0; j Date: Wed, 19 Jul 2023 14:47:55 -0400 Subject: [PATCH 76/80] Update Intel SYCL compiler driver. Update device selectors and accessors to SYCL2020. (#629) Signed-off-by: James Brodman --- Cxx11/dgemm-onemkl.cc | 2 +- Cxx11/dgemm-sycl.cc | 8 +++--- Cxx11/generate-sycl-stencil.py | 4 +-- Cxx11/nstream-dpcpp.cc | 2 +- Cxx11/nstream-onedpl.cc | 2 +- Cxx11/nstream-onemkl.cc | 2 +- Cxx11/nstream-sycl-explicit-usm.cc | 23 ++--------------- Cxx11/nstream-sycl-explicit.cc | 38 +++++++--------------------- Cxx11/nstream-sycl-usm.cc | 23 ++--------------- Cxx11/nstream-sycl.cc | 30 ++++------------------ Cxx11/p2p-hyperplane-sycl.cc | 4 +-- Cxx11/pic-sycl.cc | 15 +++++------ Cxx11/prk_sycl.h | 2 -- Cxx11/stencil-2d-sycl.cc | 27 +++----------------- Cxx11/stencil-sycl-usm.cc | 23 ++--------------- Cxx11/stencil-sycl.cc | 27 +++----------------- Cxx11/stencil_sycl.hpp | 40 +++++++++++++++--------------- Cxx11/transpose-2d-sycl.cc | 27 +++----------------- Cxx11/transpose-dpcpp.cc | 2 +- Cxx11/transpose-sycl-usm.cc | 23 ++--------------- Cxx11/transpose-sycl.cc | 27 +++----------------- Cxx11/xgemm-onemkl.cc | 21 ++-------------- common/make.defs.oneapi | 2 +- 23 files changed, 80 insertions(+), 294 deletions(-) diff --git a/Cxx11/dgemm-onemkl.cc b/Cxx11/dgemm-onemkl.cc index d1f9b65ec..0ebccd128 100644 --- a/Cxx11/dgemm-onemkl.cc +++ b/Cxx11/dgemm-onemkl.cc @@ -126,7 +126,7 @@ int main(int argc, char * argv[]) } std::cout << "Input copy = " << (input_copy ? 
"yes" : "no") << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/dgemm-sycl.cc b/Cxx11/dgemm-sycl.cc index a7ca3dd4f..dda801652 100644 --- a/Cxx11/dgemm-sycl.cc +++ b/Cxx11/dgemm-sycl.cc @@ -73,9 +73,9 @@ void prk_dgemm(sycl::queue & q, { q.submit([&](sycl::handler& h) { - auto A = d_A.get_access(h); - auto B = d_B.get_access(h); - auto C = d_C.get_access(h); + sycl::accessor A(d_A, h, sycl::read_only); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h); h.parallel_for( sycl::range<2>{order,order}, [=] (sycl::id<2> it) { @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) return 1; } - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); if (tile_size < order) { diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index c67f2d124..9a28bdb2e 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -26,8 +26,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('{\n') src.write(' q.submit([&](sycl::handler& h) {\n') if (not usm): - src.write(' auto in = d_in.template get_access(h);\n') - src.write(' auto out = d_out.template get_access(h);\n') + src.write(' sycl::accessor in(d_in, h, sycl::read_only);\n') + src.write(' sycl::accessor out(d_out, h);\n') if (dim==2): for r in range(1,radius+1): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') diff --git a/Cxx11/nstream-dpcpp.cc b/Cxx11/nstream-dpcpp.cc index efc0fcaf3..4306adc12 100644 --- a/Cxx11/nstream-dpcpp.cc +++ b/Cxx11/nstream-dpcpp.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Vector length = " << length << std::endl; std::cout << "Block size = " << block_size << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); size_t padded_length = block_size * prk::divceil(length,block_size); diff --git a/Cxx11/nstream-onedpl.cc b/Cxx11/nstream-onedpl.cc index 963683945..8cd48fc2a 100644 --- a/Cxx11/nstream-onedpl.cc +++ b/Cxx11/nstream-onedpl.cc @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/nstream-onemkl.cc b/Cxx11/nstream-onemkl.cc index 0c69f9808..55448ec74 100644 --- a/Cxx11/nstream-onemkl.cc +++ b/Cxx11/nstream-onemkl.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::default_selector_v, sycl::property::queue::in_order{}); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index aa5c5c690..cf5f9f89a 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -275,7 +275,7 @@ int main(int argc, char * argv[]) 
////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -294,26 +294,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index adf045d32..e7cf0bd57 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -100,15 +100,15 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) sycl::buffer d_C { sycl::range<1>{length} }; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::no_init); h.fill(A,(T)0); }); q.submit([&](sycl::handler& h) { - sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::no_init); h.fill(B,(T)2); }); q.submit([&](sycl::handler& h) { - sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor C(d_C, h, sycl::no_init); h.fill(C,(T)2); }); q.wait(); @@ -118,10 +118,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -164,7 +163,7 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) nstream_time = prk::wtime() - nstream_time; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::read_only); h.copy(A,h_A.data()); }); q.wait(); @@ -268,26 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -306,7 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, 
iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index e872a5130..cc2865324 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -253,7 +253,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -272,26 +272,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 140125f9d..902291315 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -106,10 +106,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -250,26 +249,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -288,7 +268,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 1e2083982..5611a84a7 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -148,7 +148,7 @@ int main(int argc, char* argv[]) q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); unsigned begin = std::max(2,i-n+2); unsigned end = std::min(i,n)+1; @@ -172,7 +172,7 @@ int main(int argc, char* argv[]) } q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); h.single_task([=] { grid[0*n+0] = -grid[(n-1)*n+(n-1)]; diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc 
index c55e5f4ff..b47572ba7 100644 --- a/Cxx11/pic-sycl.cc +++ b/Cxx11/pic-sycl.cc @@ -523,14 +523,12 @@ int main(int argc, char ** argv) { std::string devname = (devchar==NULL ? "None" : devchar); sycl::device d; if (devname == "CPU") { - d = sycl::cpu_selector{}.select_device(); + d = sycl::device{sycl::cpu_selector_v}; } else if (devname == "GPU") { - d = sycl::gpu_selector{}.select_device(); - } else if (devname == "HOST") { - d = sycl::host_selector{}.select_device(); + d = sycl::device{sycl::gpu_selector_v}; } else { - std::cout << "PRK_DEVICE should be CPU, GPU or HOST" << std::endl; - d = sycl::default_selector{}.select_device(); + std::cout << "PRK_DEVICE should be CPU or GPU" << std::endl; + d = sycl::device{sycl::default_selector_v}; } sycl::queue q(d); prk::SYCL::print_device_platform(q); @@ -603,9 +601,8 @@ int main(int argc, char ** argv) { /* Calculate forces on particles and update positions */ q.submit([&](sycl::handler& cgh) { - - auto p = d_particles.get_access(cgh); - auto q = d_Qgrid.get_access(cgh); + sycl::accessor p(d_particles, cgh); + sycl::accessor q(d_Qgrid, cgh, sycl::read_only); cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_work_size), sycl::range<1>(local_work_size)), [=] (sycl::nd_item<1> item) { auto i = item.get_global_id(0); diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index 8d37e489d..f70516d89 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -6,8 +6,6 @@ #include "CL/sycl.hpp" -namespace sycl = cl::sycl; - #if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && defined(__LIBSYCL_PATCH_VERSION) # define __LIBSYCL_VERSION \ (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + __LIBSYCL_PATCH_VERSION) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index b6eeb09bc..b945e9ad7 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -123,7 +123,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { // accessor methods - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -142,7 +142,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); // Add constant to solution to force refresh of neighbor data, if any h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -278,7 +278,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -297,26 +297,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); 
run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index b219b24f1..3f4a687fd 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -267,7 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::cpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -286,26 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::gpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 8947c8dee..f5eb3f6f5 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -121,7 +121,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star sycl::buffer d_out { h_out.data(), h_out.size() }; q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -140,7 +140,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -276,7 +276,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -295,26 +295,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 64af40b79..5339a6826 100644 --- a/Cxx11/stencil_sycl.hpp 
+++ b/Cxx11/stencil_sycl.hpp
@@ -5,8 +5,8 @@ template
 void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 1;
         const auto j = it[1] + 1;
@@ -25,8 +25,8 @@ template
 void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) {
@@ -64,8 +64,8 @@ template
 void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 2;
         const auto j = it[1] + 2;
@@ -88,8 +88,8 @@ template
 void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -137,8 +137,8 @@ template
 void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 3;
         const auto j = it[1] + 3;
@@ -165,8 +165,8 @@ template
 void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -224,8 +224,8 @@ template
 void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 4;
         const auto j = it[1] + 4;
@@ -256,8 +256,8 @@ template
 void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
@@ -325,8 +325,8 @@ template
 void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) {
         const auto i = it[0] + 5;
         const auto j = it[1] + 5;
@@ -361,8 +361,8 @@ template
 void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out)
 {
   q.submit([&](sycl::handler& h) {
-    auto in = d_in.template get_access(h);
-    auto out = d_out.template get_access(h);
+    sycl::accessor in(d_in, h, sycl::read_only);
+    sycl::accessor out(d_out, h);
     sycl::id<2> dx1(sycl::range<2> {1,0});
     sycl::id<2> dy1(sycl::range<2> {0,1});
     sycl::id<2> dx2(sycl::range<2> {2,0});
diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc
index 2fbe8938b..55d3b8393 100644
--- a/Cxx11/transpose-2d-sycl.cc
+++ b/Cxx11/transpose-2d-sycl.cc
@@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size)
     q.submit([&](sycl::handler& h) {
       // accessor methods
-      auto A = d_A.template get_access(h);
-      auto B = d_B.template get_access(h);
+      sycl::accessor A(d_A, h);
+      sycl::accessor B(d_B, h);
       h.parallel_for>(
 #if PREBUILD_KERNEL
@@ -214,7 +214,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -233,26 +233,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/transpose-dpcpp.cc b/Cxx11/transpose-dpcpp.cc
index efdb159e3..ccd1403e0 100644
--- a/Cxx11/transpose-dpcpp.cc
+++ b/Cxx11/transpose-dpcpp.cc
@@ -96,7 +96,7 @@ int main(int argc, char * argv[])
   std::cout << "Matrix order = " << order << std::endl;
   std::cout << "Block size = " << block_size << std::endl;

-  sycl::queue q(sycl::default_selector{});
+  sycl::queue q(sycl::default_selector_v);
   prk::SYCL::print_device_platform(q);

   size_t padded_order = block_size * prk::divceil(order,block_size);
diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc
index 1ec5c1470..249440ee0 100644
--- a/Cxx11/transpose-sycl-usm.cc
+++ b/Cxx11/transpose-sycl-usm.cc
@@ -197,7 +197,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -216,26 +216,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc
index da0d596c0..894a916bd 100644
--- a/Cxx11/transpose-sycl.cc
+++ b/Cxx11/transpose-sycl.cc
@@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size)
     q.submit([&](sycl::handler& h) {
       // accessor methods
-      auto A = d_A.template get_access(h);
-      auto B = d_B.template get_access(h);
+      sycl::accessor A(d_A, h);
+      sycl::accessor B(d_B, h);
       h.parallel_for>(
 #if PREBUILD_KERNEL
@@ -213,7 +213,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
@@ -232,26 +232,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order, block_size);
-#ifndef DPCPP_NO_DOUBLE
-    run(q, iterations, order, block_size);
-#endif
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order, block_size);
 #ifndef DPCPP_NO_DOUBLE
diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc
index 68dfcb587..446777a4a 100644
--- a/Cxx11/xgemm-onemkl.cc
+++ b/Cxx11/xgemm-onemkl.cc
@@ -199,7 +199,7 @@ int main(int argc, char * argv[])
   //////////////////////////////////////////////////////////////////////

   try {
-    sycl::queue q{sycl::host_selector{}};
+    sycl::queue q{sycl::cpu_selector_v};
     prk::SYCL::print_device_platform(q);
     run(q, iterations, order);
     run(q, iterations, order);
@@ -216,24 +216,7 @@ int main(int argc, char * argv[])
   }

   try {
-    sycl::queue q{sycl::cpu_selector{}};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order);
-    run(q, iterations, order);
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector{}};
+    sycl::queue q{sycl::gpu_selector_v};
     prk::SYCL::print_device_platform(q);
     bool has_fp64 = prk::SYCL::has_fp64(q);
     if (has_fp64) {
diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi
index 66b5dbd8e..fd8b7ece0 100644
--- a/common/make.defs.oneapi
+++ b/common/make.defs.oneapi
@@ -58,7 +58,7 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL
 #
 # Intel oneAPI
 #
-SYCLCXX=dpcpp
+SYCLCXX=icpx
 SYCLFLAG=-fsycl
 SYCLFLAG+=-std=c++17 -O3 -g3
 SYCLFLAG+=-DDPCPP

From 712ff1b3795670cb5c7498f2a8bc4a155f6bb707 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 20 Jul 2023 10:25:46 +0300
Subject: [PATCH 77/80] better xgemm test for onemkl (#630)

---
 Cxx11/xgemm-onemkl.cc | 81 ++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc
index 446777a4a..5dcac9384 100644
--- a/Cxx11/xgemm-onemkl.cc
+++ b/Cxx11/xgemm-onemkl.cc
@@ -1,5 +1,6 @@
 ///
 /// Copyright (c) 2020, Intel Corporation
+/// Copyright (c) 2023, NVIDIA
 ///
 /// Redistribution and use in source and binary forms, with or without
 /// modification, are permitted provided that the following conditions
@@ -63,6 +64,7 @@
 #include 
 #else
 #include 
+#include 
 #endif

 using namespace oneapi; // oneapi::mkl -> mkl
@@ -139,7 +141,7 @@ void run(sycl::queue & q, int iterations, int order)
   }
   const double residuum = std::abs(checksum - reference) / reference;
   const double epsilon{1.0e-8};
-  if (residuum < epsilon) {
+  if ((residuum < epsilon) || (sizeof(T) < 4)) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
@@ -147,8 +149,16 @@ void run(sycl::queue & q, int iterations, int order)
     std::cout << "Solution validates" << std::endl;
     auto avgtime = gemm_time/iterations;
     auto nflops = 2.0 * prk::pow(forder,3);
-    std::cout << "FP" << 8*sizeof(T)
-              << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+    auto is_fp64 = (typeid(T) == typeid(double));
+    auto is_fp32 = (typeid(T) == typeid(float));
+    auto is_fp16 = (typeid(T) == typeid(sycl::half));
+    auto is_bf16 = (typeid(T) == typeid(oneapi::mkl::bfloat16));
+    auto pname = (is_fp64 ? "FP64" :
+                  (is_fp32 ? "FP32" :
+                   (is_fp16 ? "FP16" :
+                    (is_bf16 ? "BF16" : "Unknown FP type"))));
+    std::cout << pname
+              << " Rate (MF/s): " << 1.0e-6 * nflops/avgtime
               << " Avg time (s): " << avgtime << std::endl;
   } else {
     std::cout << "Reference checksum = " << reference << "\n"
@@ -198,46 +208,31 @@ int main(int argc, char * argv[])
   /// Setup SYCL environment
   //////////////////////////////////////////////////////////////////////

-  try {
-    sycl::queue q{sycl::cpu_selector_v};
-    prk::SYCL::print_device_platform(q);
-    run(q, iterations, order);
-    run(q, iterations, order);
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
-  }
-
-  try {
-    sycl::queue q{sycl::gpu_selector_v};
-    prk::SYCL::print_device_platform(q);
-    bool has_fp64 = prk::SYCL::has_fp64(q);
-    if (has_fp64) {
-      if (prk::SYCL::print_gen12lp_helper(q)) return 1;
-    }
-    run(q, iterations, order);
-    if (has_fp64) {
-      run(q, iterations, order);
-    } else {
-      std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
-    }
-  }
-  catch (sycl::exception & e) {
-    std::cout << e.what() << std::endl;
-    prk::SYCL::print_exception_details(e);
-  }
-  catch (std::exception & e) {
-    std::cout << e.what() << std::endl;
-  }
-  catch (const char * e) {
-    std::cout << e << std::endl;
+  sycl::queue qs[2] = { sycl::queue{sycl::cpu_selector_v},
+                        sycl::queue{sycl::gpu_selector_v} };
+  for (auto q : qs) {
+    try {
+      prk::SYCL::print_device_platform(q);
+      bool has_fp64 = prk::SYCL::has_fp64(q);
+      run(q, iterations, order);
+      run(q, iterations, order);
+      run(q, iterations, order);
+      if (has_fp64) {
+        run(q, iterations, order);
+      } else {
+        std::cout << "SYCL device lacks FP64 support." << std::endl;
+      }
+    }
+    catch (sycl::exception & e) {
+      std::cout << e.what() << std::endl;
+      prk::SYCL::print_exception_details(e);
+    }
+    catch (std::exception & e) {
+      std::cout << e.what() << std::endl;
+    }
+    catch (const char * e) {
+      std::cout << e << std::endl;
+    }
   }

   return 0;

From 549978bc93f1a6c02715bc6874bb4b801c1b2045 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 4 May 2023 13:10:25 +0300
Subject: [PATCH 78/80] fix C ism bug

---
 Cxx11/pic-sycl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc
index b47572ba7..6d5d31503 100644
--- a/Cxx11/pic-sycl.cc
+++ b/Cxx11/pic-sycl.cc
@@ -126,7 +126,7 @@ double * initializeGrid(uint64_t L)
 }

 /* Completes particle distribution */
-void finish_distribution(const uint64_t n, particle_t p[const n])
+void finish_distribution(const uint64_t n, particle_t p[])
 {
   for (uint64_t pi=0; pi

Date: Thu, 4 May 2023 13:10:32 +0300
Subject: [PATCH 79/80] par exec needed

---
 Cxx11/nstream-stdpar.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Cxx11/nstream-stdpar.cc b/Cxx11/nstream-stdpar.cc
index 4723f0a93..0c4cd5ecf 100644
--- a/Cxx11/nstream-stdpar.cc
+++ b/Cxx11/nstream-stdpar.cc
@@ -145,7 +145,8 @@ int main(int argc, char * argv[])
       auto nstream = [=] (thrust::tuple t) {
           return thrust::get<0>(t) + thrust::get<1>(t) + scalar * thrust::get<2>(t);
       };
-      std::transform( thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+      std::transform( std::execution::par_unseq,
+                      thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
                       thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())),
                       A.begin(),
                       nstream);

From 65547411769eec741f3ce403469ea77ee3712291 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 20 Jul 2023 10:35:35 +0300
Subject: [PATCH 80/80] disable TBB and related because they keep breaking it

---
 Cxx11/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index ee69f1a75..7e8cb9ce3 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -60,13 +60,13 @@ ifdef OCCADIR
 endif
 OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca

-.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \
+.PHONY: all clean vector valarray openmp target opencl taskloop stl \
         ranges kokkos raja cuda cublas sycl dpcpp \
         boost-compute thrust executor oneapi onemkl

 EXTRA=
 ifneq ($(findstring nvc++,$(CXX)),nvc++)
-  EXTRA += ranges stl pstl
+  EXTRA += ranges stl #pstl tbb # TBB keeps breaking due to API changes
 endif
 ifneq ($(OPENACCFLAG),)
   EXTRA += openacc
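
Editor's note (not part of the patch series): the SYCL hunks above migrate the sources from SYCL 1.2.1 idioms (host_selector{}/cpu_selector{}/gpu_selector{} objects, buffer::get_access<mode>(handler)) to SYCL 2020 idioms (cpu_selector_v/gpu_selector_v and CTAD accessors with sycl::read_only). The following is a minimal standalone sketch of the new style, assuming a SYCL 2020 compiler such as icpx -fsycl; the file and variable names are illustrative only and do not appear in the repository.

// sycl2020-sketch.cc: illustrative example of the SYCL 2020 idioms adopted above.
#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>

int main() {
  const size_t n = 1024;
  std::vector<double> in(n, 1.0), out(n, 0.0);
  // SYCL 2020: selector variables replace the deprecated selector classes.
  sycl::queue q{sycl::cpu_selector_v};
  {
    sycl::buffer<double> d_in(in.data(), sycl::range<1>{n});
    sycl::buffer<double> d_out(out.data(), sycl::range<1>{n});
    q.submit([&](sycl::handler& h) {
      // SYCL 2020: CTAD accessors replace buffer::get_access<mode>(h).
      sycl::accessor a_in(d_in, h, sycl::read_only);
      sycl::accessor a_out(d_out, h); // read_write by default
      h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) {
        a_out[i] = 2.0 * a_in[i];
      });
    });
  } // buffers go out of scope here and copy results back to the host vectors
  std::cout << "out[0] = " << out[0] << std::endl;
  return 0;
}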