Skip to content

Commit 62b799a

Browse files
authored
Merge branch 'default' into cuf-rename
2 parents 9fe9c32 + 6554741 commit 62b799a

File tree

109 files changed

+9201
-1928
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

109 files changed

+9201
-1928
lines changed

.gitignore

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ C1z/nstream-mmap
6565
C1z/nstream-mmap-openmp
6666
C1z/nstream-mpi
6767
C1z/nstream-openmp
68+
C1z/nstream-openacc
6869
C1z/nstream-petsc
6970
C1z/nstream-target
7071
C1z/nstream-taskloop
@@ -88,6 +89,7 @@ C1z/stencil-2d
8889
C1z/stencil-2d-openmp
8990
C1z/stencil-cilk
9091
C1z/stencil-openmp
92+
C1z/stencil-openacc
9193
C1z/stencil-target
9294
C1z/stencil-taskloop
9395
C1z/transpose
@@ -96,6 +98,7 @@ C1z/transpose-2d-openmp
9698
C1z/transpose-cilk
9799
C1z/transpose-ispc
98100
C1z/transpose-openmp
101+
C1z/transpose-openacc
99102
C1z/transpose-petsc
100103
C1z/transpose-target
101104
C1z/transpose-taskloop
@@ -136,7 +139,9 @@ Cxx11/nstream-cublas
136139
Cxx11/nstream-cuda
137140
Cxx11/nstream-cuda-managed
138141
Cxx11/nstream-dpcpp
142+
Cxx11/nstream-onedpl
139143
Cxx11/nstream-executors
144+
Cxx11/nstream-halide
140145
Cxx11/nstream-hip
141146
Cxx11/nstream-hipblas
142147
Cxx11/nstream-hipstl
@@ -151,6 +156,7 @@ Cxx11/nstream-multigpu-dpcpp
151156
Cxx11/nstream-onemkl
152157
Cxx11/nstream-opencl
153158
Cxx11/nstream-openmp
159+
Cxx11/nstream-openacc
154160
Cxx11/nstream-openmp-target
155161
Cxx11/nstream-pstl
156162
Cxx11/nstream-raja
@@ -171,6 +177,7 @@ Cxx11/nstream-vector-raja
171177
Cxx11/p2p
172178
Cxx11/p2p-doacross-openmp
173179
Cxx11/p2p-hyperplane-openmp
180+
Cxx11/p2p-hyperplane-openacc
174181
Cxx11/p2p-hyperplane-pstl
175182
Cxx11/p2p-hyperplane-stl
176183
Cxx11/p2p-hyperplane-sycl
@@ -204,11 +211,13 @@ Cxx11/stencil
204211
Cxx11/stencil-cilk
205212
Cxx11/stencil-cuda
206213
Cxx11/stencil-dpcpp
214+
Cxx11/stencil-halide
207215
Cxx11/stencil-hip
208216
Cxx11/stencil-kokkos
209217
Cxx11/stencil-mpi
210218
Cxx11/stencil-opencl
211219
Cxx11/stencil-openmp
220+
Cxx11/stencil-openacc
212221
Cxx11/stencil-openmp-target
213222
Cxx11/stencil-pstl
214223
Cxx11/stencil-raja
@@ -233,13 +242,15 @@ Cxx11/transpose-cublas
233242
Cxx11/transpose-cuda
234243
Cxx11/transpose-device-thrust
235244
Cxx11/transpose-dpcpp
245+
Cxx11/transpose-halide
236246
Cxx11/transpose-hip
237247
Cxx11/transpose-hipblas
238248
Cxx11/transpose-host-thrust
239249
Cxx11/transpose-kokkos
240250
Cxx11/transpose-mpi
241251
Cxx11/transpose-opencl
242252
Cxx11/transpose-openmp
253+
Cxx11/transpose-openacc
243254
Cxx11/transpose-openmp-target
244255
Cxx11/transpose-pstl
245256
Cxx11/transpose-raja
@@ -366,12 +377,26 @@ RUST/nstream-unsafe/Cargo.lock
366377
RUST/nstream-unsafe/target/
367378
RUST/nstream-iter/Cargo.lock
368379
RUST/nstream-iter/target/
380+
RUST/nstream-rayon/Cargo.lock
381+
RUST/nstream-rayon/target/
382+
RUST/dgemm/Cargo.lock
383+
RUST/dgemm/target/
384+
RUST/dgemm-blis/Cargo.lock
385+
RUST/dgemm-blis/target/
386+
RUST/dgemm-iter/Cargo.lock
387+
RUST/dgemm-iter/target/
388+
RUST/dgemm-rayon/Cargo.lock
389+
RUST/dgemm-rayon/target/
369390
RUST/p2p/Cargo.lock
370391
RUST/p2p/target/
371392
RUST/stencil/Cargo.lock
372393
RUST/stencil/target/
373394
RUST/transpose/Cargo.lock
374395
RUST/transpose/target/
396+
RUST/transpose-iter/Cargo.lock
397+
RUST/transpose-iter/target/
398+
RUST/transpose-rayon/Cargo.lock
399+
RUST/transpose-rayon/target/
375400
SERIAL/AMR/amr
376401
SERIAL/Branch/branch
377402
SERIAL/DGEMM/dgemm

C1z/Makefile

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ include ../common/PRKVERSION
33

44
CPPFLAGS = -DPRKVERSION=$(PRKVERSION)
55

6-
CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS)
7-
86
# debugging
97
ifdef VERBOSE
10-
CFLAGS += -DVERBOSE
8+
CPPFLAGS += -DVERBOSE
119
endif
1210

11+
CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS)
12+
13+
1314
ifdef PRK_USE_MMAP
1415
CFLAGS += -DPRK_USE_MMAP
1516
endif
@@ -25,31 +26,19 @@ OMPFLAGS = $(OPENMPFLAG)
2526
TARGETFLAGS = $(OFFLOADFLAG)
2627
CILKFLAGS = $(CILKFLAG)
2728
ISPCFLAGS = $(ISPCFLAG)
29+
OPENACCFLAGS = $(OPENACCFLAG)
2830

29-
.PHONY: all clean serial thread openmp target taskloop ispc # cilk
31+
.PHONY: all clean serial thread openmp tasks target taskloop ispc
3032

3133
EXTRA=
32-
ifeq ($(shell uname -s),Darwin)
33-
ifneq ($(findstring icc,$(CC)),icc)
34-
EXTRA += target
35-
endif
36-
else
37-
ifneq ($(findstring icx,$(CC)),icx)
38-
EXTRA += target
39-
endif
40-
endif
4134
ifdef ($(ISPC))
4235
EXTRA += ispc
4336
endif
4437
ifneq ($(CILKFLAG),)
4538
EXTRA += cilk
4639
endif
47-
ifeq ($(findstring xlc,$(CC)),xlc)
48-
EXTRA = target
49-
CFLAGS += -DXLC
50-
endif
51-
ifneq ($(findstring icx,$(CC)),icx)
52-
EXTRA += tasks
40+
ifneq ($(OPENACCFLAG),)
41+
EXTRA += openacc
5342
endif
5443

5544
all: serial thread openmp $(EXTRA)
@@ -83,6 +72,8 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr
8372

8473
taskloop: nstream-taskloop stencil-taskloop transpose-taskloop
8574

75+
openacc: nstream-openacc stencil-openacc transpose-openacc
76+
8677
cilk: stencil-cilk transpose-cilk
8778

8879
ispc: transpose-ispc
@@ -132,6 +123,9 @@ p2p-2d: p2p-2d.c prk_util.h
132123
%-openmp: %-openmp.c prk_util.h prk_openmp.h
133124
$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@
134125

126+
%-openacc: %-openacc.c prk_util.h
127+
$(CC) $(CFLAGS) $< $(OPENACCFLAGS) $(EXTRA_CLIBS) -o $@
128+
135129
%-cilk: %-cilk.c prk_util.h
136130
$(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@
137131

@@ -161,6 +155,7 @@ clean:
161155
-rm -f p2p-sse p2p-avx p2p-avx3 p2p-avx-tasks-openmp
162156
-rm -f *-2d
163157
-rm -f *-openmp
158+
-rm -f *-openacc
164159
-rm -f *-mpi
165160
-rm -f *-petsc
166161
-rm -f *-target

C1z/generate-c-stencil.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim):
2121
if (model=='openmp'):
2222
outer += 'OMP_FOR()\n '
2323
elif (model=='target'):
24-
outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n '
24+
outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) )\n '
2525
elif (model=='taskloop'):
2626
outer += 'OMP_TASKLOOP( firstprivate(n) shared(in,out) grainsize(gs) )\n '
27+
elif (model=='openacc'):
28+
outer += 'PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) )\n '
2729
elif (model=='cilk'):
2830
outer += '_Cilk_'
2931

@@ -82,7 +84,7 @@ def instance(src,model,pattern,r,dim):
8284
codegen(src,pattern,stencil_size,r,W,model,dim)
8385

8486
def main():
85-
for model in ['seq','openmp','target','cilk','taskloop']:
87+
for model in ['seq','openmp','target','cilk','taskloop','openacc']:
8688
src = open('stencil_'+model+'.h','w')
8789
for pattern in ['star','grid']:
8890
for r in range(1,10):

C1z/nstream-openacc.c

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
///
2+
/// Copyright (c) 2019, Intel Corporation
3+
/// Copyright (c) 2022, NVIDIA
4+
///
5+
/// Redistribution and use in source and binary forms, with or without
6+
/// modification, are permitted provided that the following conditions
7+
/// are met:
8+
///
9+
/// * Redistributions of source code must retain the above copyright
10+
/// notice, this list of conditions and the following disclaimer.
11+
/// * Redistributions in binary form must reproduce the above
12+
/// copyright notice, this list of conditions and the following
13+
/// disclaimer in the documentation and/or other materials provided
14+
/// with the distribution.
15+
/// * Neither the name of Intel Corporation nor the names of its
16+
/// contributors may be used to endorse or promote products
17+
/// derived from this software without specific prior written
18+
/// permission.
19+
///
20+
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21+
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22+
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23+
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24+
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25+
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26+
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27+
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29+
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30+
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31+
/// POSSIBILITY OF SUCH DAMAGE.
32+
33+
//////////////////////////////////////////////////////////////////////
34+
///
35+
/// NAME: nstream
36+
///
37+
/// PURPOSE: To compute memory bandwidth when adding a vector of a given
38+
/// number of double precision values to the scalar multiple of
39+
/// another vector of the same length, and storing the result in
40+
/// a third vector.
41+
///
42+
/// USAGE: The program takes as input the number
43+
/// of iterations to loop over the triad vectors and
44+
/// the length of the vectors.
45+
///
46+
/// <progname> <# iterations> <vector length>
47+
///
48+
/// The output consists of diagnostics to make sure the
49+
/// algorithm worked, and of timing statistics.
50+
///
51+
/// NOTES: Bandwidth is determined as the number of words read, plus the
52+
/// number of words written, times the size of the words, divided
53+
/// by the execution time. For a vector length of N, the total
54+
/// number of bytes read and written is 4*N*sizeof(double).
55+
///
56+
/// HISTORY: This code is loosely based on the Stream benchmark by John
57+
/// McCalpin, but does not follow all the Stream rules. Hence,
58+
/// reported results should not be associated with Stream in
59+
/// external publications.
60+
///
61+
/// Converted to C++11 by Jeff Hammond, November 2017.
62+
/// Converted to C11 by Jeff Hammond, February 2019.
63+
///
64+
//////////////////////////////////////////////////////////////////////
65+
66+
#include <openacc.h>
67+
#include "prk_util.h"
68+
69+
int main(int argc, char * argv[])
70+
{
71+
printf("Parallel Research Kernels version %d\n", PRKVERSION );
72+
printf("C11/OpenACC STREAM triad: A = B + scalar * C\n");
73+
74+
//////////////////////////////////////////////////////////////////////
75+
/// Read and test input parameters
76+
//////////////////////////////////////////////////////////////////////
77+
78+
if (argc < 3) {
79+
printf("Usage: <# iterations> <vector length>\n");
80+
return 1;
81+
}
82+
83+
int iterations = atoi(argv[1]);
84+
if (iterations < 1) {
85+
printf("ERROR: iterations must be >= 1\n");
86+
return 1;
87+
}
88+
89+
// length of the vector
90+
size_t length = atol(argv[2]);
91+
if (length <= 0) {
92+
printf("ERROR: Vector length must be greater than 0\n");
93+
return 1;
94+
}
95+
96+
printf("Number of iterations = %d\n", iterations);
97+
printf("Vector length = %zu\n", length);
98+
99+
//////////////////////////////////////////////////////////////////////
100+
// Allocate space and perform the computation
101+
//////////////////////////////////////////////////////////////////////
102+
103+
double nstream_time = 0.0;
104+
105+
size_t bytes = length*sizeof(double);
106+
double * restrict A = acc_malloc(bytes);
107+
double * restrict B = acc_malloc(bytes);
108+
double * restrict C = acc_malloc(bytes);
109+
110+
double scalar = 3.0;
111+
112+
{
113+
#pragma acc parallel loop deviceptr(A,B,C)
114+
for (size_t i=0; i<length; i++) {
115+
A[i] = 0.0;
116+
B[i] = 2.0;
117+
C[i] = 2.0;
118+
}
119+
120+
for (int iter = 0; iter<=iterations; iter++) {
121+
122+
if (iter==1) nstream_time = prk_wtime();
123+
124+
#pragma acc parallel loop deviceptr(A,B,C)
125+
for (size_t i=0; i<length; i++) {
126+
A[i] += B[i] + scalar * C[i];
127+
}
128+
}
129+
nstream_time = prk_wtime() - nstream_time;
130+
}
131+
132+
//////////////////////////////////////////////////////////////////////
133+
/// Analyze and output results
134+
//////////////////////////////////////////////////////////////////////
135+
136+
double ar = 0.0;
137+
double br = 2.0;
138+
double cr = 2.0;
139+
for (int i=0; i<=iterations; i++) {
140+
ar += br + scalar * cr;
141+
}
142+
143+
ar *= length;
144+
145+
double asum = 0.0;
146+
#pragma acc parallel loop reduction( +:asum ) deviceptr(A)
147+
for (size_t i=0; i<length; i++) {
148+
asum += fabs(A[i]);
149+
}
150+
151+
acc_free(A);
152+
acc_free(B);
153+
acc_free(C);
154+
155+
double epsilon=1.e-8;
156+
if (fabs(ar-asum)/asum > epsilon) {
157+
printf("Failed Validation on output array\n"
158+
" Expected checksum: %lf\n"
159+
" Observed checksum: %lf\n"
160+
"ERROR: solution did not validate\n", ar, asum);
161+
return 1;
162+
} else {
163+
printf("Solution validates\n");
164+
double avgtime = nstream_time/iterations;
165+
double nbytes = 4.0 * length * sizeof(double);
166+
printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
167+
}
168+
169+
return 0;
170+
}
171+
172+

C1z/nstream-petsc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ int main(int argc, char * argv[])
119119
#endif
120120
PetscPrintf(PETSC_COMM_WORLD,"Number of processes = %d\n", np);
121121
PetscPrintf(PETSC_COMM_WORLD,"Number of iterations = %d\n", iterations);
122-
PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", length);
122+
PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", (size_t)length);
123123

124124
//////////////////////////////////////////////////////////////////////
125125
// Allocate space and perform the computation

0 commit comments

Comments
 (0)