Skip to content

Commit

Permalink
Merge branch 'default' into cuf-rename
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffhammond authored Jul 25, 2023
2 parents 9fe9c32 + 6554741 commit 62b799a
Show file tree
Hide file tree
Showing 109 changed files with 9,201 additions and 1,928 deletions.
25 changes: 25 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ C1z/nstream-mmap
C1z/nstream-mmap-openmp
C1z/nstream-mpi
C1z/nstream-openmp
C1z/nstream-openacc
C1z/nstream-petsc
C1z/nstream-target
C1z/nstream-taskloop
Expand All @@ -88,6 +89,7 @@ C1z/stencil-2d
C1z/stencil-2d-openmp
C1z/stencil-cilk
C1z/stencil-openmp
C1z/stencil-openacc
C1z/stencil-target
C1z/stencil-taskloop
C1z/transpose
Expand All @@ -96,6 +98,7 @@ C1z/transpose-2d-openmp
C1z/transpose-cilk
C1z/transpose-ispc
C1z/transpose-openmp
C1z/transpose-openacc
C1z/transpose-petsc
C1z/transpose-target
C1z/transpose-taskloop
Expand Down Expand Up @@ -136,7 +139,9 @@ Cxx11/nstream-cublas
Cxx11/nstream-cuda
Cxx11/nstream-cuda-managed
Cxx11/nstream-dpcpp
Cxx11/nstream-onedpl
Cxx11/nstream-executors
Cxx11/nstream-halide
Cxx11/nstream-hip
Cxx11/nstream-hipblas
Cxx11/nstream-hipstl
Expand All @@ -151,6 +156,7 @@ Cxx11/nstream-multigpu-dpcpp
Cxx11/nstream-onemkl
Cxx11/nstream-opencl
Cxx11/nstream-openmp
Cxx11/nstream-openacc
Cxx11/nstream-openmp-target
Cxx11/nstream-pstl
Cxx11/nstream-raja
Expand All @@ -171,6 +177,7 @@ Cxx11/nstream-vector-raja
Cxx11/p2p
Cxx11/p2p-doacross-openmp
Cxx11/p2p-hyperplane-openmp
Cxx11/p2p-hyperplane-openacc
Cxx11/p2p-hyperplane-pstl
Cxx11/p2p-hyperplane-stl
Cxx11/p2p-hyperplane-sycl
Expand Down Expand Up @@ -204,11 +211,13 @@ Cxx11/stencil
Cxx11/stencil-cilk
Cxx11/stencil-cuda
Cxx11/stencil-dpcpp
Cxx11/stencil-halide
Cxx11/stencil-hip
Cxx11/stencil-kokkos
Cxx11/stencil-mpi
Cxx11/stencil-opencl
Cxx11/stencil-openmp
Cxx11/stencil-openacc
Cxx11/stencil-openmp-target
Cxx11/stencil-pstl
Cxx11/stencil-raja
Expand All @@ -233,13 +242,15 @@ Cxx11/transpose-cublas
Cxx11/transpose-cuda
Cxx11/transpose-device-thrust
Cxx11/transpose-dpcpp
Cxx11/transpose-halide
Cxx11/transpose-hip
Cxx11/transpose-hipblas
Cxx11/transpose-host-thrust
Cxx11/transpose-kokkos
Cxx11/transpose-mpi
Cxx11/transpose-opencl
Cxx11/transpose-openmp
Cxx11/transpose-openacc
Cxx11/transpose-openmp-target
Cxx11/transpose-pstl
Cxx11/transpose-raja
Expand Down Expand Up @@ -366,12 +377,26 @@ RUST/nstream-unsafe/Cargo.lock
RUST/nstream-unsafe/target/
RUST/nstream-iter/Cargo.lock
RUST/nstream-iter/target/
RUST/nstream-rayon/Cargo.lock
RUST/nstream-rayon/target/
RUST/dgemm/Cargo.lock
RUST/dgemm/target/
RUST/dgemm-blis/Cargo.lock
RUST/dgemm-blis/target/
RUST/dgemm-iter/Cargo.lock
RUST/dgemm-iter/target/
RUST/dgemm-rayon/Cargo.lock
RUST/dgemm-rayon/target/
RUST/p2p/Cargo.lock
RUST/p2p/target/
RUST/stencil/Cargo.lock
RUST/stencil/target/
RUST/transpose/Cargo.lock
RUST/transpose/target/
RUST/transpose-iter/Cargo.lock
RUST/transpose-iter/target/
RUST/transpose-rayon/Cargo.lock
RUST/transpose-rayon/target/
SERIAL/AMR/amr
SERIAL/Branch/branch
SERIAL/DGEMM/dgemm
Expand Down
33 changes: 14 additions & 19 deletions C1z/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ include ../common/PRKVERSION

CPPFLAGS = -DPRKVERSION=$(PRKVERSION)

CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS)

# debugging
ifdef VERBOSE
CFLAGS += -DVERBOSE
CPPFLAGS += -DVERBOSE
endif

CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS)


ifdef PRK_USE_MMAP
CFLAGS += -DPRK_USE_MMAP
endif
Expand All @@ -25,31 +26,19 @@ OMPFLAGS = $(OPENMPFLAG)
TARGETFLAGS = $(OFFLOADFLAG)
CILKFLAGS = $(CILKFLAG)
ISPCFLAGS = $(ISPCFLAG)
OPENACCFLAGS = $(OPENACCFLAG)

.PHONY: all clean serial thread openmp target taskloop ispc # cilk
# Alias targets that name groups of executables rather than files.
# openacc and cilk are listed too, so a stray file with either name in this
# directory cannot make "make openacc" / "make cilk" silently do nothing.
.PHONY: all clean serial thread openmp tasks target taskloop ispc openacc cilk

EXTRA=
ifeq ($(shell uname -s),Darwin)
ifneq ($(findstring icc,$(CC)),icc)
EXTRA += target
endif
else
ifneq ($(findstring icx,$(CC)),icx)
EXTRA += target
endif
endif
# Add the ispc targets only when an ISPC compiler has been configured.
# ifdef expects a bare variable NAME; "ifdef ($(ISPC))" instead tests a
# variable literally called "(<value of ISPC>)", which is never defined,
# so test the expanded value for non-emptiness.
ifneq ($(strip $(ISPC)),)
EXTRA += ispc
endif
ifneq ($(CILKFLAG),)
EXTRA += cilk
endif
ifeq ($(findstring xlc,$(CC)),xlc)
EXTRA = target
CFLAGS += -DXLC
endif
ifneq ($(findstring icx,$(CC)),icx)
EXTRA += tasks
ifneq ($(OPENACCFLAG),)
EXTRA += openacc
endif

all: serial thread openmp $(EXTRA)
Expand Down Expand Up @@ -83,6 +72,8 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr

taskloop: nstream-taskloop stencil-taskloop transpose-taskloop

openacc: nstream-openacc stencil-openacc transpose-openacc

cilk: stencil-cilk transpose-cilk

ispc: transpose-ispc
Expand Down Expand Up @@ -132,6 +123,9 @@ p2p-2d: p2p-2d.c prk_util.h
%-openmp: %-openmp.c prk_util.h prk_openmp.h
$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@

# Pattern rule: build <name>-openacc from <name>-openacc.c, rebuilding when
# prk_util.h changes. The source ($<) is compiled and linked in one step with
# $(CFLAGS), the OpenACC flags and any $(EXTRA_CLIBS), writing the target ($@).
%-openacc: %-openacc.c prk_util.h
$(CC) $(CFLAGS) $< $(OPENACCFLAGS) $(EXTRA_CLIBS) -o $@

%-cilk: %-cilk.c prk_util.h
$(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@

Expand Down Expand Up @@ -161,6 +155,7 @@ clean:
-rm -f p2p-sse p2p-avx p2p-avx3 p2p-avx-tasks-openmp
-rm -f *-2d
-rm -f *-openmp
-rm -f *-openacc
-rm -f *-mpi
-rm -f *-petsc
-rm -f *-target
Expand Down
6 changes: 4 additions & 2 deletions C1z/generate-c-stencil.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim):
if (model=='openmp'):
outer += 'OMP_FOR()\n '
elif (model=='target'):
outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n '
outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) )\n '
elif (model=='taskloop'):
outer += 'OMP_TASKLOOP( firstprivate(n) shared(in,out) grainsize(gs) )\n '
elif (model=='openacc'):
outer += 'PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) )\n '
elif (model=='cilk'):
outer += '_Cilk_'

Expand Down Expand Up @@ -82,7 +84,7 @@ def instance(src,model,pattern,r,dim):
codegen(src,pattern,stencil_size,r,W,model,dim)

def main():
for model in ['seq','openmp','target','cilk','taskloop']:
for model in ['seq','openmp','target','cilk','taskloop','openacc']:
src = open('stencil_'+model+'.h','w')
for pattern in ['star','grid']:
for r in range(1,10):
Expand Down
172 changes: 172 additions & 0 deletions C1z/nstream-openacc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
///
/// Copyright (c) 2019, Intel Corporation
/// Copyright (c) 2022, NVIDIA
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: nstream
///
/// PURPOSE: To compute memory bandwidth when adding a vector of a given
/// number of double precision values to the scalar multiple of
/// another vector of the same length, and storing the result in
/// a third vector.
///
/// USAGE: The program takes as input the number
/// of iterations to loop over the triad vectors and
/// the length of the vectors.
///
/// <progname> <# iterations> <vector length>
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// NOTES: Bandwidth is determined as the number of words read, plus the
/// number of words written, times the size of the words, divided
/// by the execution time. For a vector length of N, the total
/// number of bytes read and written is 4*N*sizeof(double).
///
/// HISTORY: This code is loosely based on the Stream benchmark by John
/// McCalpin, but does not follow all the Stream rules. Hence,
/// reported results should not be associated with Stream in
/// external publications
///
/// Converted to C++11 by Jeff Hammond, November 2017.
/// Converted to C11 by Jeff Hammond, February 2019.
///
//////////////////////////////////////////////////////////////////////

#include <openacc.h>
#include "prk_util.h"

/// Entry point of the C11/OpenACC nstream (STREAM triad) kernel.
///
/// argv[1]  number of timed iterations, must be >= 1
/// argv[2]  vector length, must be > 0
///
/// Returns 0 when the checksum validates; returns 1 on invalid arguments,
/// a failed device allocation, or a failed validation.
int main(int argc, char * argv[])
{
  printf("Parallel Research Kernels version %d\n", PRKVERSION );
  printf("C11/OpenACC STREAM triad: A = B + scalar * C\n");

  //////////////////////////////////////////////////////////////////////
  /// Read and test input parameters
  //////////////////////////////////////////////////////////////////////

  if (argc < 3) {
    printf("Usage: <# iterations> <vector length>\n");
    return 1;
  }

  int iterations = atoi(argv[1]);
  if (iterations < 1) {
    printf("ERROR: iterations must be >= 1\n");
    return 1;
  }

  // length of the vector
  // Validate the signed result of atol() before converting to size_t:
  // a negative value stored directly in a size_t wraps to a huge positive
  // number and would pass the "greater than 0" check.
  long length_arg = atol(argv[2]);
  if (length_arg <= 0) {
    printf("ERROR: Vector length must be greater than 0\n");
    return 1;
  }
  size_t length = (size_t)length_arg;

  printf("Number of iterations = %d\n", iterations);
  printf("Vector length        = %zu\n", length);

  //////////////////////////////////////////////////////////////////////
  // Allocate space and perform the computation
  //////////////////////////////////////////////////////////////////////

  double nstream_time = 0.0;

  size_t bytes = length*sizeof(double);
  double * restrict A = acc_malloc(bytes);
  double * restrict B = acc_malloc(bytes);
  double * restrict C = acc_malloc(bytes);

  // NOTE(review): assumes acc_malloc reports failure by returning NULL;
  // confirm against the OpenACC runtime in use. The check is harmless
  // otherwise and prevents the kernels below from using a null deviceptr.
  if (A == NULL || B == NULL || C == NULL) {
    printf("ERROR: acc_malloc failed to allocate %zu bytes per vector\n", bytes);
    if (A != NULL) acc_free(A);
    if (B != NULL) acc_free(B);
    if (C != NULL) acc_free(C);
    return 1;
  }

  double scalar = 3.0;

  // Initialize the three vectors through their device pointers.
  #pragma acc parallel loop deviceptr(A,B,C)
  for (size_t i=0; i<length; i++) {
    A[i] = 0.0;
    B[i] = 2.0;
    C[i] = 2.0;
  }

  // The loop runs iterations+1 times; the timer is started at iter==1,
  // so iteration 0 is executed but not timed.
  for (int iter = 0; iter<=iterations; iter++) {

    if (iter==1) nstream_time = prk_wtime();

    #pragma acc parallel loop deviceptr(A,B,C)
    for (size_t i=0; i<length; i++) {
      A[i] += B[i] + scalar * C[i];
    }
  }
  nstream_time = prk_wtime() - nstream_time;

  //////////////////////////////////////////////////////////////////////
  /// Analyze and output results
  //////////////////////////////////////////////////////////////////////

  // Replay the same iterations+1 updates on one scalar element to obtain
  // the expected per-element value, then scale by the vector length.
  double ar = 0.0;
  double br = 2.0;
  double cr = 2.0;
  for (int i=0; i<=iterations; i++) {
    ar += br + scalar * cr;
  }

  ar *= length;

  // Observed checksum: sum of |A[i]| over the vector.
  double asum = 0.0;
  #pragma acc parallel loop reduction( +:asum ) deviceptr(A)
  for (size_t i=0; i<length; i++) {
    asum += fabs(A[i]);
  }

  acc_free(A);
  acc_free(B);
  acc_free(C);

  double epsilon=1.e-8;
  if (fabs(ar-asum)/asum > epsilon) {
    printf("Failed Validation on output array\n"
           "       Expected checksum: %lf\n"
           "       Observed checksum: %lf\n"
           "ERROR: solution did not validate\n", ar, asum);
    return 1;
  } else {
    printf("Solution validates\n");
    double avgtime = nstream_time/iterations;
    double nbytes = 4.0 * length * sizeof(double);
    printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
  }

  return 0;
}


2 changes: 1 addition & 1 deletion C1z/nstream-petsc.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ int main(int argc, char * argv[])
#endif
PetscPrintf(PETSC_COMM_WORLD,"Number of processes = %d\n", np);
PetscPrintf(PETSC_COMM_WORLD,"Number of iterations = %d\n", iterations);
PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", length);
PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", (size_t)length);

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
Expand Down
Loading

0 comments on commit 62b799a

Please sign in to comment.