From 0fe90e7a9bd975b8491c4d92cb06c22af360c97a Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Thu, 17 Feb 2022 10:49:09 +0100 Subject: [PATCH] ocl: pooled malloc for data-handles (#559) * Fixed evil style after applying clang-format. * Fixed pruning the number of devices. --- src/acc/opencl/acc_opencl.c | 205 +++++++++++++++++++---------- src/acc/opencl/acc_opencl.h | 8 ++ src/acc/opencl/acc_opencl_event.c | 34 +++-- src/acc/opencl/acc_opencl_mem.c | 54 +++++--- src/acc/opencl/acc_opencl_stream.c | 51 ++++--- src/acc/opencl/smm/opencl_libsmm.c | 165 +++++++++++++++-------- 6 files changed, 354 insertions(+), 163 deletions(-) diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 419348dc0f1..a55da08f6f8 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -75,10 +75,11 @@ cl_context c_dbcsr_acc_opencl_context(void) { for (; i < c_dbcsr_acc_opencl_config.nthreads; ++i) { if (tid != i) { /* adopt another context */ result = c_dbcsr_acc_opencl_config.contexts[i]; - if (NULL != result && CL_SUCCESS == clRetainContext(result)) + if (NULL != result && CL_SUCCESS == clRetainContext(result)) { break; - else + } else { result = NULL; + } } } } @@ -97,8 +98,9 @@ cl_context c_dbcsr_acc_opencl_device_context(cl_device_id device, const int *thr if (CL_SUCCESS == clGetContextInfo(result, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device_id, NULL) && device == device_id) { break; - } else + } else { result = NULL; + } } } return result; @@ -117,11 +119,11 @@ int c_dbcsr_acc_opencl_order_devices(const void *dev_a, const void *dev_b) { assert(NULL != a && NULL != b && a != b); ACC_OPENCL_EXPECT(EXIT_SUCCESS, clGetDeviceInfo(*a, CL_DEVICE_TYPE, sizeof(cl_device_type), &type_a, NULL)); ACC_OPENCL_EXPECT(EXIT_SUCCESS, clGetDeviceInfo(*b, CL_DEVICE_TYPE, sizeof(cl_device_type), &type_b, NULL)); - if (CL_DEVICE_TYPE_DEFAULT & type_a) + if (CL_DEVICE_TYPE_DEFAULT & type_a) { return -1; - else if (CL_DEVICE_TYPE_DEFAULT & type_b) + } else if 
(CL_DEVICE_TYPE_DEFAULT & type_b) { return 1; - else { + } else { if (CL_DEVICE_TYPE_GPU & type_a) { if (CL_DEVICE_TYPE_GPU & type_b) { int unified_a, unified_b; @@ -132,26 +134,29 @@ int c_dbcsr_acc_opencl_order_devices(const void *dev_a, const void *dev_b) { return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1))); } /* discrete GPU goes in front */ - else if (0 == unified_b) + else if (0 == unified_b) { return 1; - else + } else { return -1; - } else + } + } else { return -1; - } else if (CL_DEVICE_TYPE_GPU & type_b) + } + } else if (CL_DEVICE_TYPE_GPU & type_b) { return 1; - else { + } else { if (CL_DEVICE_TYPE_CPU & type_a) { if (CL_DEVICE_TYPE_CPU & type_b) { size_t size_a, size_b; ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*a, NULL, &size_a, NULL, NULL)); ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*b, NULL, &size_b, NULL, NULL)); return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1))); - } else + } else { return -1; - } else if (CL_DEVICE_TYPE_CPU & type_b) + } + } else if (CL_DEVICE_TYPE_CPU & type_b) { return 1; - else { + } else { size_t size_a = 0, size_b = 0; ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*a, NULL, &size_a, NULL, NULL)); ACC_OPENCL_EXPECT(EXIT_SUCCESS, c_dbcsr_acc_opencl_info_devmem(*b, NULL, &size_b, NULL, NULL)); @@ -251,14 +256,15 @@ int c_dbcsr_acc_init(void) { } } if (NULL != env_devtype && '\0' != *env_devtype) { - if (NULL != libxsmm_stristr(env_devtype, "gpu")) + if (NULL != libxsmm_stristr(env_devtype, "gpu")) { type = CL_DEVICE_TYPE_GPU; - else if (NULL != libxsmm_stristr(env_devtype, "cpu")) + } else if (NULL != libxsmm_stristr(env_devtype, "cpu")) { type = CL_DEVICE_TYPE_CPU; - else if (NULL != libxsmm_stristr(env_devtype, "acc") || NULL != libxsmm_stristr(env_devtype, "other")) { + } else if (NULL != libxsmm_stristr(env_devtype, "acc") || NULL != libxsmm_stristr(env_devtype, "other")) { type = CL_DEVICE_TYPE_ACCELERATOR; - } else + } 
else { type = CL_DEVICE_TYPE_ALL; + } } c_dbcsr_acc_opencl_config.ndevices = 0; for (i = 0; i < nplatforms; ++i) { @@ -293,8 +299,9 @@ int c_dbcsr_acc_init(void) { c_dbcsr_acc_opencl_config.devices + c_dbcsr_acc_opencl_config.ndevices, NULL)) { ACC_OPENCL_CHECK(clReleaseDevice(devices[j]), "release device", result); c_dbcsr_acc_opencl_config.ndevices += n; - } else + } else { break; + } } else { c_dbcsr_acc_opencl_config.devices[c_dbcsr_acc_opencl_config.ndevices] = devices[j]; ++c_dbcsr_acc_opencl_config.ndevices; @@ -317,10 +324,12 @@ int c_dbcsr_acc_init(void) { memmove(c_dbcsr_acc_opencl_config.devices + i, c_dbcsr_acc_opencl_config.devices + i + 1, sizeof(cl_device_id) * (c_dbcsr_acc_opencl_config.ndevices - i)); } - } else + } else { ++i; - } else + } + } else { break; /* error: retrieving device vendor */ + } } } /* reorder devices according to c_dbcsr_acc_opencl_order_devices */ @@ -349,17 +358,19 @@ int c_dbcsr_acc_init(void) { && 0 != strncmp(buffer, tmp, ACC_OPENCL_BUFFERSIZE)) { c_dbcsr_acc_opencl_config.ndevices = i + 1; strncpy(tmp, buffer, ACC_OPENCL_BUFFERSIZE); - } else + } else { break; /* error: retrieving device name */ + } } - } else + } else { break; /* error: retrieving device type */ + } } - } - /* prune number of devices to only expose requested ID */ - else if (0 != device_id) { + } else { /* prune number of devices to only expose requested ID */ if (1 < c_dbcsr_acc_opencl_config.ndevices) { - c_dbcsr_acc_opencl_config.devices[0] = c_dbcsr_acc_opencl_config.devices[device_id]; + if (0 < device_id) { + c_dbcsr_acc_opencl_config.devices[0] = c_dbcsr_acc_opencl_config.devices[device_id]; + } c_dbcsr_acc_opencl_config.ndevices = 1; } device_id = 0; @@ -375,15 +386,39 @@ int c_dbcsr_acc_init(void) { if (NULL != c_dbcsr_acc_opencl_config.contexts) { result = c_dbcsr_acc_opencl_set_active_device(/*master*/ 0, device_id); assert(EXIT_SUCCESS == result || NULL == c_dbcsr_acc_opencl_config.contexts[/*master*/ 0]); - } else + } else { result = 
EXIT_FAILURE; + } + c_dbcsr_acc_opencl_config.handle = 0; + c_dbcsr_acc_opencl_config.handles = NULL; + c_dbcsr_acc_opencl_config.storage = NULL; +#if LIBXSMM_VERSION4(1, 17, 0, 2188) <= LIBXSMM_VERSION_NUMBER && defined(ACC_OPENCL_HANDLES_MAXCOUNT) && \ + (0 < ACC_OPENCL_HANDLES_MAXCOUNT) + if (EXIT_SUCCESS == result) { + c_dbcsr_acc_opencl_config.handle = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; + c_dbcsr_acc_opencl_config.handles = (void **)malloc(sizeof(void *) * c_dbcsr_acc_opencl_config.handle); + c_dbcsr_acc_opencl_config.storage = malloc(sizeof(void *) * c_dbcsr_acc_opencl_config.handle); + if (NULL != c_dbcsr_acc_opencl_config.handles && NULL != c_dbcsr_acc_opencl_config.storage) { + libxsmm_pmalloc_init(sizeof(void *), &c_dbcsr_acc_opencl_config.handle, c_dbcsr_acc_opencl_config.handles, + c_dbcsr_acc_opencl_config.storage); + } else { + free(c_dbcsr_acc_opencl_config.handles); + free(c_dbcsr_acc_opencl_config.storage); + c_dbcsr_acc_opencl_config.handles = NULL; + c_dbcsr_acc_opencl_config.storage = NULL; + c_dbcsr_acc_opencl_config.handle = 0; + result = EXIT_FAILURE; + } + } +#endif if (EXIT_SUCCESS == result) { const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; c_dbcsr_acc_opencl_config.streams = (void **)calloc(nelements, sizeof(void *)); /* allocate streams */ if (NULL != c_dbcsr_acc_opencl_config.streams) { /* allocate counters */ c_dbcsr_acc_opencl_config.stats = (cl_command_queue *)calloc(nelements, sizeof(cl_command_queue)); - } else + } else { result = EXIT_FAILURE; + } } } ACC_OPENCL_DEBUG_FPRINTF( @@ -406,8 +441,9 @@ int c_dbcsr_acc_init(void) { * The implementation of c_dbcsr_acc_init should hence be safe against "over initialization". * However, DBCSR only calls c_dbcsr_acc_init (and expects an implicit libsmm_acc_init). 
*/ - if (EXIT_SUCCESS == result) + if (EXIT_SUCCESS == result) { result = libsmm_acc_init(); + } #endif } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -441,8 +477,9 @@ int c_dbcsr_acc_finalize(void) { * The implementation of c_dbcsr_acc_init should be safe against "over initialization". * However, DBCSR only calls c_dbcsr_acc_init and expects an implicit libsmm_acc_init(). */ - if (EXIT_SUCCESS == result) + if (EXIT_SUCCESS == result) { result = libsmm_acc_finalize(); + } #endif if (0 != c_dbcsr_acc_opencl_config.verbosity) { cl_device_id device; @@ -459,11 +496,13 @@ int c_dbcsr_acc_finalize(void) { fprintf(stderr, " streams={"); for (i = 0; i < nelements; i += ACC_OPENCL_STREAMS_MAXCOUNT) { for (j = 0, nstreams = 0; j < ACC_OPENCL_STREAMS_MAXCOUNT; ++j) { - if (NULL != c_dbcsr_acc_opencl_config.stats[i + j]) + if (NULL != c_dbcsr_acc_opencl_config.stats[i + j]) { ++nstreams; + } } - if (0 != nstreams || 0 == i) + if (0 != nstreams || 0 == i) { fprintf(stderr, 0 < i ? " %i" : "%i", nstreams); + } } qsort(c_dbcsr_acc_opencl_config.stats, nelements, sizeof(cl_command_queue), c_dbcsr_acc_opencl_order_streams); /* NULL -> upper end */ @@ -503,6 +542,8 @@ int c_dbcsr_acc_finalize(void) { } } /* release/reset buffers */ + free(c_dbcsr_acc_opencl_config.handles); + free(c_dbcsr_acc_opencl_config.storage); free(c_dbcsr_acc_opencl_config.streams); free(c_dbcsr_acc_opencl_config.contexts); /* clear configuration */ @@ -556,10 +597,12 @@ int c_dbcsr_acc_opencl_device(int thread_id, cl_device_id *device) { #endif if (NULL != context) { result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), device, NULL); - } else + } else { *device = NULL; - } else + } + } else { *device = NULL; + } ACC_OPENCL_RETURN(result); } @@ -568,12 +611,14 @@ int c_dbcsr_acc_opencl_device_id(cl_device_id device, int *device_id, int *globa assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT); assert(NULL != device_id || NULL != global_id); for (i = 
0; i < c_dbcsr_acc_opencl_config.ndevices; ++i) { - if (device == c_dbcsr_acc_opencl_config.devices[i]) + if (device == c_dbcsr_acc_opencl_config.devices[i]) { break; + } } if (i < c_dbcsr_acc_opencl_config.ndevices) { - if (NULL != device_id) + if (NULL != device_id) { *device_id = i; + } if (NULL != global_id) { *global_id = i; for (++i; i < ACC_OPENCL_DEVICES_MAXCOUNT; ++i) { @@ -582,17 +627,21 @@ int c_dbcsr_acc_opencl_device_id(cl_device_id device, int *device_id, int *globa *global_id = i; break; } - } else + } else { break; + } } } } else { - if (NULL != device_id) + if (NULL != device_id) { *device_id = -1; - if (NULL != global_id) + } + if (NULL != global_id) { *global_id = -1; - if (NULL != device) + } + if (NULL != device) { result = EXIT_FAILURE; + } } ACC_OPENCL_RETURN(result); } @@ -605,8 +654,9 @@ int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[]) { "retrieve device vendor", result); if (EXIT_SUCCESS == result) { return (NULL != libxsmm_stristr(buffer, vendor) ? EXIT_SUCCESS : EXIT_FAILURE); - } else + } else { ACC_OPENCL_RETURN(result); + } } int c_dbcsr_acc_opencl_device_name(cl_device_id device, const char match[]) { @@ -618,8 +668,9 @@ int c_dbcsr_acc_opencl_device_name(cl_device_id device, const char match[]) { if (EXIT_SUCCESS == result) { const char *const p = libxsmm_stristr(buffer, match); return (NULL != p ? 
EXIT_SUCCESS : EXIT_FAILURE); - } else + } else { ACC_OPENCL_RETURN(result); + } } int c_dbcsr_acc_opencl_devuid(const char devname[], int *uid) { @@ -665,53 +716,66 @@ int c_dbcsr_acc_opencl_device_level(cl_device_id device, int *level_major, int * if (CL_SUCCESS == result) { unsigned int cl_std_level[2]; if (2 == sscanf(buffer, "OpenCL %u.%u", cl_std_level, cl_std_level + 1)) { - if (NULL != level_major) + if (NULL != level_major) { *level_major = (int)cl_std_level[0]; - if (NULL != level_minor) + } + if (NULL != level_minor) { *level_minor = (int)cl_std_level[1]; + } if (NULL != cl_std) { if (2 <= cl_std_level[0]) { const int nchar = LIBXSMM_SNPRINTF(cl_std, 16, "-cl-std=CL%u.0", cl_std_level[0]); - if (0 >= nchar || 16 <= nchar) + if (0 >= nchar || 16 <= nchar) { result = EXIT_FAILURE; + } } else if (1 <= cl_std_level[0]) { if (1 <= cl_std_level[1]) { const int nchar = LIBXSMM_SNPRINTF(cl_std, 16, "-cl-std=CL%u.%u", cl_std_level[0], cl_std_level[1]); - if (0 >= nchar || 16 <= nchar) + if (0 >= nchar || 16 <= nchar) { result = EXIT_FAILURE; + } } else { result = clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, ACC_OPENCL_BUFFERSIZE, buffer, NULL); if (CL_SUCCESS == result) { if (2 == sscanf(buffer, "OpenCL C %u.%u", cl_std_level, cl_std_level + 1)) { const int nchar = LIBXSMM_SNPRINTF(cl_std, 16, "-cl-std=CL%u.%u", cl_std_level[0], cl_std_level[1]); - if (0 >= nchar || 16 <= nchar) + if (0 >= nchar || 16 <= nchar) { result = EXIT_FAILURE; + } } else { result = EXIT_FAILURE; *cl_std = '\0'; } - } else + } else { *cl_std = '\0'; + } } - } else + } else { *cl_std = '\0'; + } } } else { - if (NULL != level_major) + if (NULL != level_major) { *level_major = 0; - if (NULL != level_minor) + } + if (NULL != level_minor) { *level_minor = 0; - if (NULL != cl_std) + } + if (NULL != cl_std) { *cl_std = '\0'; + } result = EXIT_FAILURE; } } else { - if (NULL != level_major) + if (NULL != level_major) { *level_major = 0; - if (NULL != level_minor) + } + if (NULL != 
level_minor) { *level_minor = 0; - if (NULL != cl_std) + } + if (NULL != cl_std) { *cl_std = '\0'; + } } } if (NULL != type && EXIT_SUCCESS == result) { @@ -898,10 +962,12 @@ int c_dbcsr_acc_opencl_device_synchronize(int thread_id) { void *const stream = streams[i]; if (NULL != stream) { result = c_dbcsr_acc_stream_sync(stream); - if (EXIT_SUCCESS != result) + if (EXIT_SUCCESS != result) { break; - } else + } + } else { break; + } } } ACC_OPENCL_RETURN(result); @@ -986,7 +1052,7 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con if (NULL != extnames) { int n = num_exts, nflat = 0; size_t size_ext = 0; - for (; 0 < n; --n) + for (; 0 < n; --n) { if (NULL != extnames[n - 1]) { const char *const end = buffer + strlen(extnames[n - 1]); char *ext = strtok(strncpy(buffer, extnames[n - 1], ACC_OPENCL_BUFFERSIZE - 1), ACC_OPENCL_DELIMS " \t"); @@ -995,13 +1061,14 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con size_ext += strlen(ext); } } + } if (0 < size_ext && 0 < nflat) { const char *const enable_ext = "#pragma OPENCL EXTENSION %s : enable\n"; const size_t size_src_ext = size_src + size_ext + nflat * (strlen(enable_ext) - 2 /*%s*/); char *const ext_source_buffer = (char *)libxsmm_aligned_scratch(size_src_ext + 1 /*terminator*/, 0 /*auto-align*/); if (NULL != ext_source_buffer) { - for (n = 0; 0 < num_exts; --num_exts) + for (n = 0; 0 < num_exts; --num_exts) { if (NULL != extnames[num_exts - 1]) { const char *const end = buffer_name + strlen(extnames[num_exts - 1]); char *ext = strtok(strncpy(buffer_name, extnames[num_exts - 1], ACC_OPENCL_MAXSTRLEN * 2 - 1), @@ -1019,10 +1086,11 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con break; } line = strchr(line, '\n'); - if (NULL != line) + if (NULL != line) { ++line; - else + } else { break; + } } #if !defined(NDEBUG) if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_id, (const char **)&ext, 1)) @@ -1038,6 +1106,7 @@ 
int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con #endif } } + } memcpy(ext_source_buffer + n, source, size_src); size_src += n; /* according to given/permitted extensions */ assert(size_src <= size_src_ext); @@ -1089,8 +1158,9 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con libxsmm_free((void *)ext_source); src[size] = '\0'; ext_source = src; - } else + } else { libxsmm_free(src); + } } ACC_OPENCL_EXPECT(EXIT_SUCCESS, fclose(file)); } @@ -1129,8 +1199,9 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con } ok = EXIT_FAILURE; } - if (source != ext_source) + if (source != ext_source) { libxsmm_free((void *)ext_source); + } buffer[0] = '\0'; /* reset to empty */ if (CL_SUCCESS == result) { *kernel = clCreateKernel(program, kernel_name, &result); @@ -1186,11 +1257,13 @@ int c_dbcsr_acc_opencl_kernel(const char source[], const char kernel_name[], con } } #if !defined(NDEBUG) - if (EXIT_SUCCESS != result) + if (EXIT_SUCCESS != result) { *kernel = NULL; + } #endif - if (NULL != try_ok) + if (NULL != try_ok) { *try_ok = result | ok; + } ACC_OPENCL_RETURN_CAUSE(result, buffer); } diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 0181a87eb0e..cb62c2a9449 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -66,6 +66,10 @@ #define ACC_OPENCL_DEVICES_MAXCOUNT 256 #endif /** Counted on a per-thread basis! */ +#if !defined(ACC_OPENCL_HANDLES_MAXCOUNT) +#define ACC_OPENCL_HANDLES_MAXCOUNT 1024 +#endif +/** Counted on a per-thread basis! */ #if !defined(ACC_OPENCL_STREAMS_MAXCOUNT) #define ACC_OPENCL_STREAMS_MAXCOUNT 128 #endif @@ -224,6 +228,10 @@ typedef struct c_dbcsr_acc_opencl_config_t { cl_bool devmatch; /** Table of activated device contexts (thread-specific). */ cl_context *contexts; + /** Handle-counter. */ + size_t handle; + /** All handles and related storage. 
*/ + void **handles, *storage; /** All created streams partitioned by thread-ID (thread-local slots). */ void **streams; /** Counts number of streams created (thread-local). */ diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index a22adb0a74d..1839d3bfd18 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -34,9 +34,9 @@ extern "C" { int c_dbcsr_acc_opencl_event_create(cl_event *event_p) { int result; assert(NULL != event_p); - if (NULL != *event_p) + if (NULL != *event_p) { result = EXIT_SUCCESS; - else { + } else { *event_p = clCreateUserEvent(c_dbcsr_acc_opencl_context(), &result); } if (CL_SUCCESS == result) { @@ -49,8 +49,9 @@ int c_dbcsr_acc_opencl_event_create(cl_event *event_p) { ACC_OPENCL_EXPECT(CL_SUCCESS, clReleaseEvent(*event_p)); *event_p = NULL; } - } else + } else { *event_p = NULL; /* error: creating user-defined event */ + } return result; } @@ -74,7 +75,10 @@ int c_dbcsr_acc_event_create(void **event_p) { assert(sizeof(void *) >= sizeof(cl_event)); *event_p = (void *)event; #else - *event_p = malloc(sizeof(cl_event)); + assert(NULL == c_dbcsr_acc_opencl_config.handles || sizeof(void *) >= sizeof(cl_event)); + *event_p = (NULL != c_dbcsr_acc_opencl_config.handles + ? 
libxsmm_pmalloc(c_dbcsr_acc_opencl_config.handles, &c_dbcsr_acc_opencl_config.handle) + : malloc(sizeof(cl_event))); if (NULL != *event_p) { *(cl_event *)*event_p = event; } else { @@ -86,8 +90,9 @@ int c_dbcsr_acc_event_create(void **event_p) { #endif } #if defined(ACC_OPENCL_EVENT_CREATE) - else + else { *event_p = NULL; /* error: creating user-defined event */ + } #endif #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -105,12 +110,18 @@ int c_dbcsr_acc_event_destroy(void *event) { #endif if (NULL != event) { const cl_event clevent = *ACC_OPENCL_EVENT(event); - if (NULL != clevent) + if (NULL != clevent) { result = clReleaseEvent(clevent); + } #if defined(ACC_OPENCL_EVENT_NOALLOC) assert(sizeof(void *) >= sizeof(cl_event)); #else - free(event); + if (NULL != c_dbcsr_acc_opencl_config.handles) { + /**(cl_event*)event = NULL; assert(NULL == *ACC_OPENCL_EVENT(event));*/ + libxsmm_pfree(event, c_dbcsr_acc_opencl_config.handles, &c_dbcsr_acc_opencl_config.handle); + } else { + free(event); + } #endif } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -200,8 +211,9 @@ int c_dbcsr_acc_event_query(void *event, c_dbcsr_acc_bool_t *has_occurred) { } } else { /* error state */ #if defined(ACC_OPENCL_EVENT_CREATE) - if (CL_SUCCESS == result) + if (CL_SUCCESS == result) { result = EXIT_FAILURE; + } #else result = EXIT_SUCCESS; #endif @@ -225,11 +237,13 @@ int c_dbcsr_acc_event_synchronize(void *event) { /* waits on the host-side */ assert(NULL != event); clevent = *ACC_OPENCL_EVENT(event); #if !defined(ACC_OPENCL_EVENT_CREATE) - if (NULL == clevent) + if (NULL == clevent) { result = EXIT_SUCCESS; - else + } else #endif + { result = clWaitForEvents(1, &clevent); + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 2b52986e058..e4ffc92ec18 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ 
b/src/acc/opencl/acc_opencl_mem.c @@ -122,8 +122,9 @@ int c_dbcsr_acc_host_mem_allocate(void **host_mem, size_t nbytes, void *stream) ACC_OPENCL_EXPECT(CL_SUCCESS, clReleaseMemObject(memory)); *host_mem = NULL; } - } else + } else { *host_mem = NULL; /* error: creating host buffer */ + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -152,19 +153,23 @@ int c_dbcsr_acc_host_mem_deallocate(void *host_mem, void *stream) { { cl_context context; result = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL); - if (CL_SUCCESS == result) + if (CL_SUCCESS == result) { clSVMFree(context, info.mapped); + } } #elif defined(ACC_OPENCL_MALLOC_LIBXSMM) libxsmm_free(info.mapped); #endif result_release = clReleaseMemObject(info.memory); - if (EXIT_SUCCESS == result) + if (EXIT_SUCCESS == result) { result = result_release; - } else + } + } else { result = EXIT_FAILURE; - } else + } + } else { result = EXIT_FAILURE; + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -218,7 +223,10 @@ int c_dbcsr_acc_dev_mem_allocate(void **dev_mem, size_t nbytes) { assert(sizeof(void *) >= sizeof(cl_mem)); *dev_mem = (void *)buffer; #else - *dev_mem = malloc(sizeof(cl_mem)); + assert(NULL == c_dbcsr_acc_opencl_config.handles || sizeof(void *) >= sizeof(cl_mem)); + *dev_mem = (NULL != c_dbcsr_acc_opencl_config.handles + ? 
libxsmm_pmalloc(c_dbcsr_acc_opencl_config.handles, &c_dbcsr_acc_opencl_config.handle) + : malloc(sizeof(cl_mem))); if (NULL != *dev_mem) { *(cl_mem *)*dev_mem = buffer; } else { @@ -233,8 +241,9 @@ int c_dbcsr_acc_dev_mem_allocate(void **dev_mem, size_t nbytes) { result = EXIT_FAILURE; } #endif - } else + } else { *dev_mem = NULL; /* error: creating device buffer */ + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -260,7 +269,12 @@ int c_dbcsr_acc_dev_mem_deallocate(void *dev_mem) { #if defined(ACC_OPENCL_MEM_NOALLOC) assert(sizeof(void *) >= sizeof(cl_mem)); #else - free(dev_mem); + if (NULL != c_dbcsr_acc_opencl_config.handles) { + /**(cl_mem*)dev_mem = NULL; assert(NULL == *ACC_OPENCL_MEM(dev_mem));*/ + libxsmm_pfree(dev_mem, c_dbcsr_acc_opencl_config.handles, &c_dbcsr_acc_opencl_config.handle); + } else { + free(dev_mem); + } #endif #if defined(ACC_OPENCL_SVM) /*if (NULL != ptr)*/ @@ -289,8 +303,9 @@ int c_dbcsr_acc_dev_mem_set_ptr(void **dev_mem, void *other, size_t lb) { if (NULL != other || 0 == lb) { *dev_mem = (char *)other + lb; result = EXIT_SUCCESS; - } else + } else { result = EXIT_FAILURE; + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -451,8 +466,9 @@ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t *mem_free, size_t #elif defined(__APPLE__) && defined(__MACH__) /*const*/ size_t size_pages_free = sizeof(const long), size_pages_total = sizeof(const long); ACC_OPENCL_EXPECT(0, sysctlbyname("hw.memsize", &pages_total, &size_pages_total, NULL, 0)); - if (0 < page_size) + if (0 < page_size) { pages_total /= page_size; + } if (0 != sysctlbyname("vm.page_free_count", &pages_free, &size_pages_free, NULL, 0)) { pages_free = pages_total; } @@ -479,24 +495,30 @@ int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t *mem_free, size_t ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), 
&cl_unified, NULL), "retrieve if host memory is unified", result); if (EXIT_SUCCESS == result) { - if (cl_size_total < size_total) + if (cl_size_total < size_total) { size_total = cl_size_total; - if (size_total < size_free) + } + if (size_total < size_free) { size_free = size_total; + } size_local = cl_size_local; unified = cl_unified; } } result = (size_free <= size_total ? EXIT_SUCCESS : EXIT_FAILURE); assert(NULL != mem_local || NULL != mem_total || NULL != mem_free || NULL != mem_unified); - if (NULL != mem_unified) + if (NULL != mem_unified) { *mem_unified = unified; - if (NULL != mem_local) + } + if (NULL != mem_local) { *mem_local = size_local; - if (NULL != mem_total) + } + if (NULL != mem_total) { *mem_total = size_total; - if (NULL != mem_free) + } + if (NULL != mem_free) { *mem_free = size_free; + } return result; } diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index 3b0a22a3470..98001a9ffd1 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -40,7 +40,7 @@ c_dbcsr_acc_opencl_info_stream_t *c_dbcsr_acc_opencl_info_stream(void *stream) { result = (c_dbcsr_acc_opencl_info_stream_t *)((uintptr_t)stream - sizeof(c_dbcsr_acc_opencl_info_stream_t)); } else #endif - result = NULL; + { result = NULL; } return result; } @@ -50,11 +50,11 @@ const int *c_dbcsr_acc_opencl_stream_priority(const void *stream) { LIBXSMM_UNUSED(stream); #else const c_dbcsr_acc_opencl_info_stream_t *const info = c_dbcsr_acc_opencl_info_stream((void *)stream); - if (NULL != info) + if (NULL != info) { result = &info->priority; - else + } else #endif - result = NULL; + { result = NULL; } return result; } @@ -65,8 +65,9 @@ int c_dbcsr_acc_opencl_stream_is_thread_specific(int thread_id, const void *stre if (NULL != stream) { int i = 0; for (; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { - if (stream == streams[i]) + if (stream == streams[i]) { return EXIT_SUCCESS; + } } return EXIT_FAILURE; } @@ -99,8 +100,9 @@ int 
c_dbcsr_acc_stream_create(void **stream_p, const char *name, int priority) { (NULL != libxsmm_stristr(name, "calc") || (NULL != strstr(name, "priority")))) ? CL_QUEUE_PRIORITY_HIGH_KHR : CL_QUEUE_PRIORITY_MED_KHR; - } else + } else { properties[3] = least; + } } if (CL_QUEUE_PRIORITY_HIGH_KHR <= properties[3] && CL_QUEUE_PRIORITY_LOW_KHR >= properties[3]) { priority = properties[3]; /* sanitize */ @@ -133,7 +135,9 @@ int c_dbcsr_acc_stream_create(void **stream_p, const char *name, int priority) { c_dbcsr_acc_opencl_config.contexts[/*master*/ 0], LIBXSMM_ATOMIC_RELAXED); } else #endif + { tid = 0; /*master*/ + } { const cl_context context = c_dbcsr_acc_opencl_config.contexts[tid]; if (NULL != context) { @@ -153,11 +157,12 @@ int c_dbcsr_acc_stream_create(void **stream_p, const char *name, int priority) { t = j < c_dbcsr_acc_opencl_config.nthreads ? j : (j - c_dbcsr_acc_opencl_config.nthreads); if (0 != t) { /* avoid cloning master's streams */ streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * t; - for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) + for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { if (NULL != streams[i]) { n = c_dbcsr_acc_opencl_config.nthreads; break; } + } } } assert(i == ACC_OPENCL_STREAMS_MAXCOUNT || c_dbcsr_acc_opencl_config.streams <= streams); @@ -166,23 +171,28 @@ int c_dbcsr_acc_stream_create(void **stream_p, const char *name, int priority) { assert(NULL != streams); stream = *ACC_OPENCL_STREAM(streams[i]); result = clRetainCommandQueue(stream); - if (CL_SUCCESS == result) + if (CL_SUCCESS == result) { queue = stream; - } else + } + } else { queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, properties, &result); + } } } - } else + } else { result = EXIT_FAILURE; + } } assert(NULL != stream_p); if (EXIT_SUCCESS == result) { const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid; cl_command_queue *const stats = c_dbcsr_acc_opencl_config.stats + base; streams = c_dbcsr_acc_opencl_config.streams + base; - for (i = 
0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) - if (NULL == streams[i]) + for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { + if (NULL == streams[i]) { break; + } + } if (i < ACC_OPENCL_STREAMS_MAXCOUNT) { /* register stream */ #if defined(ACC_OPENCL_STREAM_NOALLOC) assert(sizeof(void *) >= sizeof(cl_command_queue) && NULL != queue); @@ -224,8 +234,9 @@ int c_dbcsr_acc_stream_create(void **stream_p, const char *name, int priority) { #endif ACC_OPENCL_DEBUG_FPRINTF(stderr, ").\n"); } - } else + } else { *stream_p = NULL; + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -247,11 +258,12 @@ int c_dbcsr_acc_stream_destroy(void *stream) { void **streams = NULL; for (; tid < c_dbcsr_acc_opencl_config.nthreads; ++tid) { /* unregister */ streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * tid; - for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) + for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { if (stream == streams[i]) { tid = c_dbcsr_acc_opencl_config.nthreads; /* break outer loop */ break; } + } } if (i < ACC_OPENCL_STREAMS_MAXCOUNT) { const int j = i + 1; @@ -261,8 +273,9 @@ int c_dbcsr_acc_stream_destroy(void *stream) { if (j < ACC_OPENCL_STREAMS_MAXCOUNT && NULL != streams[j]) { memmove(streams + i, streams + j, sizeof(cl_command_queue) * (ACC_OPENCL_STREAMS_MAXCOUNT - j)); } - } else + } else { result = EXIT_FAILURE; + } } #if defined(_OPENMP) #if (201107 /*v3.1*/ <= _OPENMP) @@ -315,10 +328,12 @@ int c_dbcsr_acc_stream_priority_range(int *least, int *greatest) { } } #endif - if (NULL != greatest) + if (NULL != greatest) { *greatest = priohi; - if (NULL != least) + } + if (NULL != least) { *least = priolo; + } #if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif @@ -349,7 +364,9 @@ int c_dbcsr_acc_stream_sync(void *stream) { } } else #endif + { result = clFinish(*ACC_OPENCL_STREAM(stream)); + } #if defined(__DBCSR_ACC) && 
defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); #endif diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index b42d2a652fa..1e84c400826 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -130,10 +130,11 @@ void opencl_libsmm_print_matrix(FILE *ostream, const char *label, libsmm_acc_dat const char *const s = (NULL != label ? label : ""); const int len = (int)strlen(s); for (i = 0; i < m; ++i) { - if (0 < i) + if (0 < i) { fprintf(ostream, "%*s", len, " "); - else + } else { fprintf(ostream, "%s", s); + } for (j = 0; j < n; ++j) { switch (type) { case dbcsr_type_real_8: @@ -157,8 +158,9 @@ int opencl_libsmm_write_trans_params(FILE *stream, int only_key, const opencl_li int result = 0; if (NULL != stream) { const char d = (NULL == delim ? *ACC_OPENCL_DELIMS : *delim); - if (NULL != key || 0 == only_key) + if (NULL != key || 0 == only_key) { result += fprintf(stream, "%c", NULL == begin ? '{' : *begin); + } if (NULL != config) { if (NULL != key) { result += fprintf(stream, @@ -178,10 +180,12 @@ int opencl_libsmm_write_trans_params(FILE *stream, int only_key, const opencl_li /*if (0 == only_key) result += fprintf(stream, "%c", d);*/ } } - if (NULL != key || 0 == only_key) + if (NULL != key || 0 == only_key) { result += fprintf(stream, "%c", NULL == close ? '}' : *close); - } else + } + } else { result = -1; + } assert(0 < result); return result; } @@ -192,8 +196,9 @@ int opencl_libsmm_write_smm_params(FILE *stream, int only_key, const opencl_libs int result = 0; if (NULL != stream) { const char d = (NULL == delim ? *ACC_OPENCL_DELIMS : *delim); - if (NULL != key || 0 == only_key) + if (NULL != key || 0 == only_key) { result += fprintf(stream, "%c", NULL == begin ? 
'{' : *begin); + } if (NULL != config) { if (NULL != key) { result += fprintf(stream, @@ -202,10 +207,11 @@ int opencl_libsmm_write_smm_params(FILE *stream, int only_key, const opencl_libs "%i%c" "%i", (int)key->type, d, key->m, d, key->n, d, key->k); - if (0 == only_key) + if (0 == only_key) { result += fprintf(stream, "%c", d); + } } - if (0 == only_key) + if (0 == only_key) { result += fprintf(stream, "%i%c%i%c%i%c%i%c%i" "%c%i%c%i%c%i%c%i%c%i" @@ -213,6 +219,7 @@ int opencl_libsmm_write_smm_params(FILE *stream, int only_key, const opencl_libs config->bs, d, config->bm, d, config->bn, d, config->bk, d, config->ws, d, config->wg, d, config->lu, d, config->nz, d, config->al, d, config->tb, d, config->tc, d, config->ap, d, config->aa, d, config->ab, d, config->ac); + } } else { if (NULL != key) { result += fprintf(stream, @@ -221,20 +228,24 @@ int opencl_libsmm_write_smm_params(FILE *stream, int only_key, const opencl_libs "n%c" "k", d, d, d); - if (0 == only_key) + if (0 == only_key) { result += fprintf(stream, "%c", d); + } } - if (0 == only_key) + if (0 == only_key) { result += fprintf(stream, "bs%cbm%cbn%cbk%cws" "%cwg%clu%cnz%cal%ctb" "%ctc%cap%caa%cab%cac", d, d, d, d, d, d, d, d, d, d, d, d, d, d); + } } - if (NULL != key || 0 == only_key) + if (NULL != key || 0 == only_key) { result += fprintf(stream, "%c", NULL == close ? 
'}' : *close); - } else + } + } else { result = -1; + } assert(0 < result); return result; } @@ -392,15 +403,17 @@ int opencl_libsmm_read_smm_params(char *parambuf, opencl_libsmm_smmkey_t *key, o case dbcsr_type_real_8: { const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(double)); libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_dratio_sumlog, &perfest->gf_ai_dratio_kahan); - if (perfest->gf_ai_dratio_max < ratio) + if (perfest->gf_ai_dratio_max < ratio) { perfest->gf_ai_dratio_max = ratio; + } ++perfest->dcount; } break; case dbcsr_type_real_4: { const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(float)); libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_sratio_sumlog, &perfest->gf_ai_sratio_kahan); - if (perfest->gf_ai_sratio_max < ratio) + if (perfest->gf_ai_sratio_max < ratio) { perfest->gf_ai_sratio_max = ratio; + } ++perfest->scount; } break; default: @@ -430,8 +443,9 @@ int libsmm_acc_init(void) { * The implementation of c_dbcsr_acc_init should hence be safe against "over initialization". * However, DBCSR only calls c_dbcsr_acc_init (and expects an implicit libsmm_acc_init). 
*/ - if (EXIT_SUCCESS == result) + if (EXIT_SUCCESS == result) { result = c_dbcsr_acc_init(); + } #endif libxsmm_init(); if (EXIT_SUCCESS == result) { @@ -486,8 +500,9 @@ int libsmm_acc_init(void) { } fclose(file); control = '1'; - } else + } else { control = '2'; + } } #if defined(OPENCL_LIBSMM_PARAMS_SMM) if (EXIT_SUCCESS == result && '1' != control) { @@ -649,8 +664,9 @@ int libsmm_acc_finalize(void) { for (; NULL != regentry; regentry = libxsmm_get_registry_next(regentry, &regkey)) { /* opencl_libsmm_trans_t/opencl_libsmm_smm_t carry cl_kernel as 1st data member */ cl_kernel kernel = *(const cl_kernel *)regentry; - if (NULL == kernel) + if (NULL == kernel) { kernel = ((const opencl_libsmm_smm_t *)regentry)->kernel[1]; + } if (NULL != kernel) { if (3 == c_dbcsr_acc_opencl_config.verbosity) { char fname[ACC_OPENCL_MAXSTRLEN]; @@ -667,8 +683,9 @@ int libsmm_acc_finalize(void) { int blocks; OPENCL_LIBSMM_ISORT(entry->size, size); blocks = entry->size[size >> 1]; - if (0 == (1 & size)) + if (0 == (1 & size)) { blocks = (blocks + entry->size[(size >> 1) - 1]) >> 1; + } LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "INFO ACC/OpenCL: TRANS-kernel "); opencl_libsmm_write_trans_params(stderr, 1 /*only_key*/, desc, NULL /*config*/, NULL /*delim*/, @@ -695,8 +712,9 @@ int libsmm_acc_finalize(void) { int blocks; OPENCL_LIBSMM_ISORT(entry->size, size); blocks = entry->size[size >> 1]; - if (0 == (1 & size)) + if (0 == (1 & size)) { blocks = (blocks + entry->size[(size >> 1) - 1]) >> 1; + } LIBXSMM_STDIO_ACQUIRE(); fprintf(stderr, "INFO ACC/OpenCL: SMM-kernel "); opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, desc, NULL /*config*/, NULL /*delim*/, @@ -705,8 +723,9 @@ int libsmm_acc_finalize(void) { opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, desc, &dummy, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); fprintf(stderr, " ss=%i geo=%.1f", blocks, geo); - if (0 < est) + if (0 < est) { fprintf(stderr, " est=%.1f", est); + } fprintf(stderr, " GFLOPS/s\n"); 
LIBXSMM_STDIO_RELEASE(); } @@ -753,7 +772,8 @@ int libsmm_acc_transpose(const int *dev_trs_stack, int offset, int stack_size, v result = EXIT_FAILURE; #else const int mn = m * n; - assert((NULL != dev_trs_stack && NULL != dev_data && NULL != stream && 0 <= offset && 0 <= stack_size) || + assert((NULL != dev_trs_stack && NULL != stream && NULL != dev_data && NULL != *ACC_OPENCL_MEM(dev_data) && + 0 <= offset && 0 <= stack_size) || 0 == stack_size); if (( #if defined(OPENCL_LIBSMM_F64) @@ -856,8 +876,9 @@ int libsmm_acc_transpose(const int *dev_trs_stack, int offset, int stack_size, v result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE, fname, build_params, buffer, NULL /*try*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &new_config.kernel); - } else + } else { result = EXIT_FAILURE; + } } if (EXIT_SUCCESS == result) { config = (opencl_libsmm_trans_t *)OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(new_config), @@ -879,8 +900,9 @@ int libsmm_acc_transpose(const int *dev_trs_stack, int offset, int stack_size, v } } } - } else if (EXIT_SUCCESS == result) + } else if (EXIT_SUCCESS == result) { result = EXIT_FAILURE; + } } } else { result = EXIT_FAILURE; @@ -917,10 +939,12 @@ int libsmm_acc_transpose(const int *dev_trs_stack, int offset, int stack_size, v ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream), "transfer debug stack", result); ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, imat, data_size, stream), "transfer debug input", result); - } else + } else { result = EXIT_FAILURE; - } else + } + } else { result = EXIT_FAILURE; + } #endif assert(!(OPENCL_LIBSMM_NLOCKS_TRANS & (OPENCL_LIBSMM_NLOCKS_TRANS - 1))); /* POT */ { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */ @@ -976,8 +1000,9 @@ int libsmm_acc_transpose(const int *dev_trs_stack, int offset, int stack_size, v if (0 == (1 & s1)) config->size[s1] = (config->size[s1] + config->size[s2 - 1]) >> 
1; } - } else + } else { config->size[config->nexec++] = stack_size; + } #endif if (4 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { const int *const priority = c_dbcsr_acc_opencl_stream_priority(stream); @@ -1091,7 +1116,9 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable(c_dbcsr_acc_bool_t def_mnk, libsm acc = ai * opencl_libsmm_dacc; if (0 == hst || 0 == acc || hst < acc) #endif + { result = 1; + } } break; #endif @@ -1107,7 +1134,9 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable(c_dbcsr_acc_bool_t def_mnk, libsm acc = ai * opencl_libsmm_sacc; if (0 == hst || 0 == acc || hst < acc) #endif + { result = 1; + } } break; #endif @@ -1130,8 +1159,9 @@ c_dbcsr_acc_bool_t libsmm_acc_process_suitable(c_dbcsr_acc_bool_t def_mnk, libsm NULL /*close*/); fprintf(stderr, " ss=%i", stack_size); #if defined(OPENCL_LIBSMM_SUITABLE) - if (0 < hst && 0 < acc) + if (0 < hst && 0 < acc) { fprintf(stderr, " hst=%.1f acc=%.1f GFLOPS/s", hst, acc); + } #endif fprintf(stderr, " not suitable%s", 0 != def_mnk ? 
"\n" : " (inhomogeneous)\n"); LIBXSMM_STDIO_RELEASE(); @@ -1148,8 +1178,10 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, result = EXIT_FAILURE; #else LIBXSMM_UNUSED(c_stream); /* TODO */ - assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack && NULL != dev_a_data && - NULL != dev_b_data && NULL != dev_c_data)); + assert(0 == stack_size || (NULL != dev_a_data && NULL != *ACC_OPENCL_MEM(dev_a_data))); + assert(0 == stack_size || (NULL != dev_b_data && NULL != *ACC_OPENCL_MEM(dev_b_data))); + assert(0 == stack_size || (NULL != dev_c_data && NULL != *ACC_OPENCL_MEM(dev_c_data))); + assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack)); assert(0 < nparams && 0 < max_kernel_dim && NULL != stream); assert(0 <= stack_size && 0 <= m_max && 0 <= n_max && 0 <= k_max); if (0 != libsmm_acc_process_suitable(def_mnk, datatype, stack_size, m_max, n_max, k_max, max_kernel_dim)) { @@ -1188,8 +1220,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, kernel_idx = ((OPENCL_LIBSMM_DEFAULT_BS * s) <= stack_size ? 0 : 1); } else if (1 != config->bs) { kernel_idx = (((MAX(config->bs, 1) * s) <= stack_size || (0 < config->s && config->s <= stack_size)) ? 
0 : 1); - } else + } else { kernel_idx = 1; + } } if (NULL == config || NULL == config->kernel[kernel_idx]) { char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE]; @@ -1232,8 +1265,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics"; if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { atomic_type = "-DATOMIC32_ADD64 -DTA=int"; - } else + } else { tname = NULL; + } } } } @@ -1265,9 +1299,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, : atoi(env_devid); size_t wgsize_max, wgsize_prf, sgs = 0; opencl_libsmm_smm_t new_config; - if (NULL == config) + if (NULL == config) { memset(&new_config, 0, sizeof(new_config)); - else { /* preserve kernels, performance counters, etc. */ + } else { /* preserve kernels, performance counters, etc. */ memcpy(&new_config, config, sizeof(opencl_libsmm_smm_t)); } result = c_dbcsr_acc_opencl_wgsize(active_device, NULL /*device-specific*/, &wgsize_max, &wgsize_prf); @@ -1384,11 +1418,13 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, if (-1 == new_config.wg) { /* cover entire WG-size in sub-group size */ for (; (i * sizeof(size_t)) < nbytes; ++i) { sgs = sizes[i]; - if (new_config.wgsize[kernel_idx] <= sgs) + if (new_config.wgsize[kernel_idx] <= sgs) { break; + } } - if (new_config.wgsize[kernel_idx] > sgs) + if (new_config.wgsize[kernel_idx] > sgs) { sgs = 0; + } } else { /* explicit sub-group size with minimized WG-remainder */ for (; (i * sizeof(size_t)) < nbytes; ++i) { r = libxsmm_remainder((unsigned int)new_config.wgsize[kernel_idx], (unsigned int)sizes[i], @@ -1400,17 +1436,23 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, } } wgsize_prf = new_config.wgsize[kernel_idx]; - } else + } else { wgsize_prf = r; - } else + } + } else { wgsize_prf = r; + } } else #endif + { wgsize_prf = 
new_config.wgsize[kernel_idx]; - if (2 <= new_config.wg) + } + if (2 <= new_config.wg) { wgsize_prf = LIBXSMM_UP2POT(wgsize_prf); - if (wgsize_prf < (2 * new_config.wgsize[kernel_idx])) + } + if (wgsize_prf < (2 * new_config.wgsize[kernel_idx])) { new_config.wgsize[kernel_idx] = wgsize_prf; /* limit */ + } assert(1 <= new_config.bs && 0 < new_config.wgsize[kernel_idx] && 0 < wgsize_max && 0 < wgsize_prf); /* ensure minimum requested WG-size */ while ((nbm * nbn) < new_config.ws && (nbm < m_max || nbn < n_max)) { @@ -1448,8 +1490,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, (CL_DEVICE_TYPE_CPU != device_type))) ? "-D\"BARRIER(A)=work_group_barrier(A, memory_scope_work_group)\"" : "-D\"BARRIER(A)=barrier(A)\""); - } else + } else { barrier_expr = ""; /* no barrier */ + } assert(NULL != barrier_expr); if (NULL == env_atomics || '0' != *env_atomics) { const int cl_nonv = (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia")); @@ -1476,8 +1519,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, if (0 == atomics_native) { extensions[1] = "cl_intel_global_float_atomics"; atomic_ops = "-Dcl_intel_global_float_atomics"; - } else + } else { atomic_ops = "-DATOMIC_PROTOTYPES"; + } atomic_exp = (0 != std_c11 ? "atomic_fetch_add_explicit((global volatile TF*)A, B, " "memory_order_relaxed, memory_scope_work_group)" : "atomic_add(A, B)"); @@ -1493,8 +1537,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) { assert(dbcsr_type_real_4 == datatype); atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\""; - } else + } else { extensions[1] = NULL; + } atomic_exp = "atomic_add_global_cmpxchg(A, B)"; atomic_ops = (dbcsr_type_real_4 == datatype ? 
"-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); @@ -1510,8 +1555,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) { assert(dbcsr_type_real_4 == datatype); atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\""; - } else + } else { extensions[1] = NULL; + } atomic_exp = "atomic_add_global_cmpxchg(A, B)"; atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); @@ -1555,12 +1601,15 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, #endif nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero", NULL == env_options ? "" : env_options, cl_debug); - if (0 >= nchar || (int)sizeof(buffer) <= nchar) + if (0 >= nchar || (int)sizeof(buffer) <= nchar) { result = EXIT_FAILURE; - } else + } + } else { result = EXIT_FAILURE; - } else + } + } else { result = EXIT_FAILURE; /* matrix-size causes too large WG-size */ + } } if (EXIT_SUCCESS == result) { static int cl_try_ok = EXIT_SUCCESS; @@ -1582,8 +1631,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, result = c_dbcsr_acc_opencl_kernel(src, fname, build_params, buffer, cl_try, &cl_try_ok, extensions, sizeof(extensions) / sizeof(*extensions), new_config.kernel + kernel_idx); - } else + } else { libxsmm_free(src); + } } fclose(src_kernel); } @@ -1623,8 +1673,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, #endif } /* failed to register config */ - else + else { result = EXIT_FAILURE; + } } else { if (0 != c_dbcsr_acc_opencl_config.verbosity) { fprintf(stderr, "ERROR LIBSMM: tile-size causes too large WG-size (min(%u,%u) < %u)!\n", @@ -1636,10 +1687,12 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, } } } - } else + } else { result = EXIT_FAILURE; /* 
insufficient device capabilities */ - } else + } + } else { result = EXIT_FAILURE; + } /* remove configuration from registry to avoid infinitely retrying code generation */ if (EXIT_SUCCESS != result && NULL != config) { libxsmm_xrelease(&key, sizeof(key)); @@ -1705,10 +1758,12 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_c_data, gold, csize, stream), "transfer debug c-data", result); kernel_cpu = libxsmm_xmmdispatch(desc); assert(NULL != kernel_cpu.xmm); - } else + } else { result = EXIT_FAILURE; - } else + } + } else { result = EXIT_FAILURE; + } #endif assert(!(OPENCL_LIBSMM_NLOCKS_SMM & (OPENCL_LIBSMM_NLOCKS_SMM - 1))); /* POT */ { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */ @@ -1771,8 +1826,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, if (0 == (1 & s1)) config->size[s1] = (config->size[s1] + config->size[s2 - 1]) >> 1; } - } else + } else { config->size[config->nexec++] = stack_size; + } #endif if (4 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { const double est = (dbcsr_type_real_8 == datatype @@ -1787,8 +1843,9 @@ int libsmm_acc_process(const int *host_param_stack, const int *dev_param_stack, opencl_libsmm_write_smm_params(stderr, 1 /*only_key*/, &key, config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); fprintf(stderr, " prio=%i ss=%i cur=%.1f", NULL != priority ? *priority : -1, stack_size, gflops); - if (0 < est) + if (0 < est) { fprintf(stderr, " est=%.1f", est); + } fprintf(stderr, " GFLOPS/s dur=%.2g ms\n", 1E3 * duration); LIBXSMM_STDIO_RELEASE(); }