Skip to content

Commit 966e81e

Browse files
authored
OpenCL: support different format for parameter-stack (#901)
- Allow LIBSMM to be reused for CP2K/DBM. - Provide opencl_libsmm_acc_process to support param_format. - Allow host_param_stack to be a NULL-pointer. - Signal support for OPENCL_LIBSMM_PFORMAT. - Allow external perf-event control. - Updated tuned parameters. ACC_BENCH: - Adjusted rule for MAX_KERNEL_DIM (max_kernel_dim).
1 parent 12671ff commit 966e81e

8 files changed

+241
-133
lines changed

src/acc/acc_bench.c

+21-21
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,6 @@
5454
#if !defined(ELEM_TYPE)
5555
# define ELEM_TYPE double
5656
#endif
57-
#if !defined(MAX_KERNEL_DIM)
58-
# define MAX_KERNEL_DIM 80
59-
#endif
6057
#if !defined(ALIGNMENT)
6158
# define ALIGNMENT 64
6259
#endif
@@ -291,6 +288,7 @@ int main(int argc, char* argv[]) {
291288
#else
292289
const int mn = m * n, mk = m * k, kn = k * n;
293290
#endif
291+
const int max_kernel_dim = ceil(sqrt(m * n));
294292
int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL;
295293
ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL;
296294
ELEM_TYPE *amat_dev = NULL, *bmat_dev = NULL, *cmat_dev = NULL;
@@ -353,7 +351,7 @@ int main(int argc, char* argv[]) {
353351
PRINTF(
354352
"%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k, nc, na, nb);
355353
PRINTF("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
356-
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n || MAX_KERNEL_DIM < k) {
354+
if (MAX_KERNEL_DIM < max_kernel_dim) {
357355
fprintf(stderr, "ERROR: Matrix shape exceeds MAX_KERNEL_DIM!\n");
358356
result = EXIT_FAILURE;
359357
}
@@ -364,7 +362,7 @@ int main(int argc, char* argv[]) {
364362
CHECK(c_dbcsr_acc_host_mem_allocate((void**)(void*)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result, check);
365363
CHECK(c_dbcsr_acc_host_mem_allocate((void**)(void*)&trans_hst, sizeof(int) * nb, stream), &result, check);
366364
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check); /* ensure host-data is allocated */
367-
if (NULL != amat_hst && NULL != bmat_hst && NULL != trans_hst && NULL != stack_hst) {
365+
if (NULL != amat_hst && NULL != bmat_hst && NULL != trans_hst && NULL != stack_hst && EXIT_SUCCESS == result) {
368366
init_stack(stack_hst, stack_size, NRAND, rnd, mn, mk, kn, nc, na, nb);
369367
#if defined(_OPENMP)
370368
# pragma omp parallel
@@ -404,7 +402,7 @@ int main(int argc, char* argv[]) {
404402
}
405403
#if defined(USE_LIBXSMM)
406404
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check);
407-
if (NULL != amat_hst && NULL != bmat_hst && NULL != stack_hst) {
405+
if (NULL != amat_hst && NULL != bmat_hst && NULL != stack_hst && EXIT_SUCCESS == result) {
408406
const size_t size = (sizeof(ELEM_TYPE) * (mk * na + kn * nb) + sizeof(int) * 3 * stack_size) * nrepeat_h2d;
409407
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
410408
perf_h2d = size / (duration * (1ULL << 30));
@@ -414,17 +412,17 @@ int main(int argc, char* argv[]) {
414412
#if defined(TRANSPOSE) && defined(VALIDATE)
415413
/* warmup execution and prebuild transpose-kernel */
416414
for (r = 0; r < warmup / 2; ++r) {
417-
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream),
415+
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, max_kernel_dim, stream),
418416
&result, check);
419-
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream),
417+
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), n, k, max_kernel_dim, stream),
420418
&result, check);
421419
}
422420
# if defined(USE_LIBXSMM)
423421
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check);
424422
start = libxsmm_timer_tick();
425423
# endif
426424
/* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
427-
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream),
425+
CHECK(libsmm_acc_transpose(trans_dev, 0 /*offset*/, nb, bmat_dev, DBCSR_TYPE(ELEM_TYPE), k, n, max_kernel_dim, stream),
428426
&result, check);
429427
# if defined(USE_LIBXSMM)
430428
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check);
@@ -434,7 +432,7 @@ int main(int argc, char* argv[]) {
434432
/* warmup execution and prebuild SMM-kernel */
435433
for (r = 0; r < warmup; ++r) {
436434
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k,
437-
MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
435+
max_kernel_dim, 1 /*homogeneous*/, stream, stream),
438436
&result, check);
439437
}
440438
CHECK(c_dbcsr_acc_memset_zero(cmat_dev, 0 /*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result, check);
@@ -445,28 +443,30 @@ int main(int argc, char* argv[]) {
445443
for (r = 0; r < nrepeat; ++r) {
446444
/* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */
447445
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k,
448-
MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
446+
max_kernel_dim, 1 /*homogeneous*/, stream, stream),
449447
&result, check);
450448
}
451449
#if defined(USE_LIBXSMM)
452450
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check);
453451
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
454-
if (0 < duration && EXIT_SUCCESS == result) {
452+
if (EXIT_SUCCESS == result) {
453+
if (0 < duration) {
455454
# if defined(TRANSPOSE) && defined(VALIDATE)
456-
PRINTF("transpose: %.2g ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / (nrepeat * nrepeat_smm),
457-
1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * nrepeat_smm) / (duration + transpose));
455+
PRINTF("transpose: %.2g ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / (nrepeat * nrepeat_smm),
456+
1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * nrepeat_smm) / (duration + transpose));
458457
# endif
459-
perf_dev = 1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * nrepeat_smm) / duration;
460-
PRINTF("device: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * nrepeat_smm), perf_dev);
461-
}
462-
else {
458+
perf_dev = 1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * nrepeat_smm) / duration;
459+
PRINTF("device: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * nrepeat_smm), perf_dev);
460+
}
461+
else {
463462
# if defined(TRANSPOSE)
464-
PRINTF("transpose: 0 ms 0 GFLOPS/s\n");
463+
PRINTF("transpose: 0 ms 0 GFLOPS/s\n");
465464
# endif
466-
PRINTF("device: 0 ms 0 GFLOPS/s\n");
465+
PRINTF("device: 0 ms 0 GFLOPS/s\n");
466+
}
467467
}
468468
# if defined(VALIDATE)
469-
{
469+
if (EXIT_SUCCESS == result) {
470470
ELEM_TYPE* const gold_hst = (ELEM_TYPE*)(0 != check ? libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc) : NULL);
471471
/* determine host's performance independent of current result code/status */
472472
if (NULL != gold_hst && NULL != amat_hst && NULL != bmat_hst && NULL != stack_hst) {

src/acc/libsmm_acc/libsmm_acc_benchmark.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ int libsmm_acc_benchmark(
323323
//===========================================================================
324324
int libsmm_acc_benchmark_transpose_(int n_stack, int* stack, int* d_stack, double* mat, double* mat_trs, double* d_mat, int n,
325325
int mat_m, int mat_n, ACC_DRV(event) start, ACC_DRV(event) stop, char** kernel_descr, TransposeLauncher* launcher) {
326-
if (mat_m > MAX_KERNEL_DIM || mat_n > MAX_KERNEL_DIM) {
326+
if ((mat_m * mat_n) > (MAX_KERNEL_DIM * MAX_KERNEL_DIM)) {
327327
printf("Cannot transpose matrices with dimensions above %i, got (%i x %i)\n", MAX_KERNEL_DIM, mat_m, mat_n);
328328
exit(1);
329329
}

src/acc/opencl/smm/kernels/multiply.cl

+41-34
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,6 @@
5353
#if !defined(SINT) /* covers matrix shape */
5454
# define SINT signed char
5555
#endif
56-
57-
#if defined(SLM_P) && (1 < BS)
58-
# define IDXBASE 0
59-
#else
60-
# define IDXBASE 1
61-
#endif
6256
#if !defined(REPEAT)
6357
# define REPEAT 1
6458
#endif
@@ -76,20 +70,23 @@ __attribute__((reqd_work_group_size(WG, 1, 1)))
7670
__attribute__((intel_reqd_sub_group_size(SG)))
7771
#endif
7872
kernel void
79-
FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* restrict bdata,
73+
FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* restrict bdata, GLOBAL const int* restrict param_stack,
8074
#if (1 < BS)
81-
GLOBAL const int* restrict param_stack, int stack_size, int bs) {
75+
int param_format, int stack_size, int bs) {
8276
const int gid = get_group_id(0), idx = get_local_id(0);
8377
#else
84-
GLOBAL const int* restrict param_stack) {
78+
int param_format) {
8579
const int gid = get_group_id(0), idx = get_local_id(0), bs = 1;
8680
#endif
87-
/* indexes given by param_stack are one-based (Fortran) */
88-
GLOBAL const int* restrict pbase = param_stack + gid * (3 * bs);
81+
const int pzero = (param_format) & 255;
82+
const int pbase = (param_format >> 8) & 255;
83+
const int pnext = (param_format >> 16) & 255;
84+
/* param_stack/indexes can be one-based (Fortran) depending on param_format */
85+
GLOBAL const int* restrict param_base = param_stack + gid * (pnext * bs);
8986
#if defined(SLM_P) && (1 < BS)
9087
local int params[3 * BS]; /* bs <= BS */
9188
#else
92-
GLOBAL const int* restrict params = pbase;
89+
GLOBAL const int* restrict params = param_base;
9390
#endif
9491
#if defined(SLM_A)
9592
# if (1 != BK || BM < SM || 1 != BN)
@@ -143,9 +140,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
143140
int c0;
144141
# if defined(SLM_C)
145142
local T cnm[SN][SM + SLM_C - 1]; /* tile in SLM */
146-
for (SINT n = (SINT)idx; n < SN; n += WG) {
147-
UNROLL_FORCE(SM) for (SINT m = 0; m < SM; ++m) cnm[n][m] = ZERO;
148-
}
143+
UNROLL_AUTO for (SINT n = (SINT)idx; n < SN; n += WG) { UNROLL_FORCE(SM) for (SINT m = 0; m < SM; ++m) cnm[n][m] = ZERO; }
149144
# elif (BM < SM || 1 != BN)
150145
# if (1 != BN)
151146
UNROLL(BN)
@@ -158,15 +153,21 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
158153
UNROLL_FORCE(SM) for (SINT m = 0; m < SM; ++m) cnm[m] = ZERO;
159154
# endif
160155
# if defined(SLM_P)
161-
UNROLL_FORCE(3 * BS) for (int i = idx; i < (3 * batchsize); i += WG) params[i] = pbase[i] - 1;
156+
UNROLL_AUTO for (int i = idx; i < batchsize; i += WG) {
157+
UNROLL_FORCE(3) for (int j = 0; j < 3; ++j) { params[3 * i + j] = param_base[pnext * i + pbase + j] - pzero; }
158+
}
162159
# endif
163160
# if defined(BARRIER) && (MAX(1, SG) < WG) && (defined(SLM_C) || defined(SLM_P))
164161
BARRIER(CLK_LOCAL_MEM_FENCE);
165162
# endif
166163
# if (WRK < WG)
167164
if (WRK <= idx) return; /* WRK <= idx */
168165
# endif
169-
c0 = params[2] - IDXBASE;
166+
# if defined(SLM_P)
167+
c0 = params[pbase + 2];
168+
# else
169+
c0 = params[pbase + 2] - pzero;
170+
# endif
170171
# if defined(BSC) && (1 != BK) && (1 != UM)
171172
UNROLL_OUTER(REPEAT * BS)
172173
# else
@@ -179,21 +180,27 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
179180
for (int item = 0; item < (REPEAT * batchsize); ++item) {
180181
const int i = item;
181182
# endif
182-
const int a0 = params[3 * i] - IDXBASE, b0 = params[3 * i + 1] - IDXBASE;
183-
const int c1 = ((i + 1) < batchsize ? (params[3 * i + 5] - IDXBASE) : -1);
183+
# if defined(SLM_P)
184+
const int idxstride = 3, idxbase = 0;
185+
# else
186+
const int idxstride = pnext, idxbase = pzero;
187+
# endif
188+
const int a0 = params[idxstride * i + pbase] - idxbase, b0 = params[idxstride * i + pbase + 1] - idxbase;
189+
const int c1 = ((i + 1) < batchsize ? (params[idxstride * i + pbase + idxstride + 2] - idxbase) : -1);
184190
#else
185191
# if (WRK < WG)
186192
if (WRK > idx) /* WRK > idx */
187193
# endif
188194
{
189-
const int a0 = params[0] - IDXBASE, b0 = params[1] - IDXBASE, c0 = params[2] - IDXBASE;
195+
const int a0 = params[pbase + 0] - pzero, b0 = params[pbase + 1] - pzero;
196+
const int c0 = params[pbase + 2] - pzero;
190197
#endif
191198

192199
#if defined(SLM_A) && (1 != BK || BM < SM || 1 != BN)
193200
{ /* copy or transpose A-matrix into SLM */
194201
int m = idx;
195202
# if (WRK != SM)
196-
for (; m < SM; m += WRK)
203+
UNROLL_AUTO for (; m < SM; m += WRK)
197204
# endif
198205
{
199206
UNROLL_FORCE(SK) for (SINT k = 0; k < SK; ++k) amk[m][k] = ADX(m, k);
@@ -205,7 +212,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
205212
{ /* copy or transpose B-matrix into SLM */
206213
int n = idx;
207214
# if (WRK != SN)
208-
for (; n < SN; n += WRK)
215+
UNROLL_AUTO for (; n < SN; n += WRK)
209216
# endif
210217
{
211218
UNROLL(SK) for (SINT k = 0; k < SK; ++k) bnk[n][k] = BDX(k, n);
@@ -252,10 +259,10 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
252259
T cnm[BN]; /* row */
253260
UNROLL_FORCE(BN) for (SINT n = 0; n < BN; ++n) cnm[n] = ZERO;
254261
# endif
255-
UNROLL(BM)
256262
# if (SM % BM)
257-
for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
263+
UNROLL_AUTO for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
258264
# else
265+
UNROLL(BM)
259266
for (SINT bm = 0, m = m0; bm < BM; m = ++bm + m0)
260267
# endif
261268
{ /* general BK, A in registers */
@@ -337,10 +344,10 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
337344
# else
338345
const T b = BNK(n, k);
339346
# endif
340-
UNROLL_FORCE(BM)
341347
# if (SM % BM)
342-
for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
348+
UNROLL_AUTO for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
343349
# else
350+
UNROLL_FORCE(BM)
344351
for (SINT bm = 0, m = m0; bm < BM; m = ++bm + m0)
345352
# endif
346353
{
@@ -387,10 +394,10 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
387394
T cnm[BM]; /* column-block */
388395
UNROLL_FORCE(BM) for (SINT m = 0; m < BM; ++m) cnm[m] = ZERO;
389396
# endif
390-
UNROLL(BM)
391397
# if (SM % BM)
392-
for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
398+
UNROLL_AUTO for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
393399
# else
400+
UNROLL(BM)
394401
for (SINT bm = 0, m = m0; bm < BM; m = ++bm + m0)
395402
# endif
396403
{
@@ -440,7 +447,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
440447
const T b = BNK(idx, k);
441448
# if defined(SLM_A)
442449
# if (WRK != SM)
443-
for (SINT m = (SINT)idx; m < SM; m += WRK) amk[m] = ADX(m, k);
450+
UNROLL_AUTO for (SINT m = (SINT)idx; m < SM; m += WRK) amk[m] = ADX(m, k);
444451
# else
445452
amk[idx] = ADX(idx, k);
446453
# endif
@@ -483,7 +490,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
483490
# if (1 == UM)
484491
UNROLL_OUTER(SM)
485492
# endif
486-
for (; m < (SM - UM + 1); m += UM) {
493+
UNROLL_AUTO for (; m < (SM - UM + 1); m += UM) {
487494
u = 0;
488495
# if (1 < UM)
489496
UNROLL(UM)
@@ -571,10 +578,10 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
571578
if (n < SN) /* n < SN */
572579
# endif
573580
{
574-
UNROLL_FORCE(BM)
575581
# if (SM % BM)
576-
for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
582+
UNROLL_AUTO for (SINT bm = 0, m = m0; bm < BM && m < SM; m = ++bm + m0)
577583
# else
584+
UNROLL_FORCE(BM)
578585
for (SINT bm = 0, m = m0; bm < BM; m = ++bm + m0)
579586
# endif
580587
{
@@ -605,7 +612,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
605612
{ /* atomically commit C-column to global memory */
606613
SINT m = 0;
607614
# if defined(ATOMIC_ADD2_GLOBAL)
608-
for (; m < (SM - 1); m += 2) {
615+
UNROLL_AUTO for (; m < (SM - 1); m += 2) {
609616
# if defined(ATOMIC_INC_NZ)
610617
if (ZERO != CNM(idx, m) && ZERO != CNM(idx, m + 1))
611618
# endif
@@ -619,7 +626,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
619626
UNROLL(SM)
620627
# endif
621628
# if !defined(ATOMIC_ADD2_GLOBAL) || (SM & 1)
622-
for (; m < SM; ++m) {
629+
UNROLL_AUTO for (; m < SM; ++m) {
623630
# if defined(ATOMIC_INC_NZ)
624631
if (ZERO != CNM(idx, m))
625632
# endif

0 commit comments

Comments
 (0)