diff --git a/examples/c/vector_add.py b/examples/c/vector_add.py index c5956f2ad..38122b49d 100755 --- a/examples/c/vector_add.py +++ b/examples/c/vector_add.py @@ -26,7 +26,7 @@ } """ -size = 72*1024*1024 +size = 72 * 1024 * 1024 a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) @@ -39,7 +39,6 @@ tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32] tune_params["vecsize"] = [1, 2, 4, 8, 16] -answer = [a+b, None, None, None] +answer = [a + b, None, None, None] -tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=['-O3']) +tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"]) diff --git a/examples/directives/histogram_c_openacc.py b/examples/directives/histogram_c_openacc.py new file mode 100644 index 000000000..d6fc1fba4 --- /dev/null +++ b/examples/directives/histogram_c_openacc.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenACC code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives + + +# Naive Python histogram implementation +def histogram(vector, hist): + for i in range(0, len(vector)): + hist[vector[i]] += 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE]) +#else +#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) +#endif +#pragma acc loop independent +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 + #pragma acc atomic update +#endif + hist[vector[i]] += 1; +} +#pragma tuner stop +""" + +# Extract tunable directive +app = Code(OpenACC(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["ngangs"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) +reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["histogram"], + tune_params, + restrictions=constraints, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], +) diff --git a/examples/directives/histogram_c_openmp.py b/examples/directives/histogram_c_openmp.py new file mode 100644 index 000000000..190c35789 --- /dev/null +++ b/examples/directives/histogram_c_openmp.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" +import numpy as np + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, 
Cxx, process_directives + + +# Naive Python histogram implementation +def histogram(vector, hist): + for i in range(0, len(vector)): + hist[vector[i]] += 1 + return hist + + +code = """ +#include + +#define HIST_SIZE 256 +#define VECTOR_SIZE 1000000 + +#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE) +#if enable_reduction == 1 +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE]) +#else +#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) +#endif +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +#if enable_atomic == 1 + #pragma omp atomic update +#endif + hist[vector[i]] += 1; +} +#pragma tuner stop +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1, 11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["enable_reduction"] = [0, 1] +tune_params["enable_atomic"] = [0, 1] +constraints = ["enable_reduction != enable_atomic"] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["histogram"][0])) + (4 * len(kernel_args["histogram"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32) +kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32) +reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32) +reference_hist = histogram(kernel_args["histogram"][0], reference_hist) +answer = [None, reference_hist] + +tune_kernel( + "histogram", + kernel_string["histogram"], + 0, + kernel_args["histogram"], + tune_params, + restrictions=constraints, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index d8bf7cc4e..96b94a1d3 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -1,13 +1,8 @@ #!/usr/bin/env python """This is an example tuning a naive matrix multiplication using the simplified directives interface""" -from kernel_tuner import tune_kernel -from kernel_tuner.utils.directives import ( - Code, - OpenACC, - Cxx, - process_directives -) +from kernel_tuner import tune_kernel, run_kernel +from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives N = 4096 @@ -45,6 +40,12 @@ metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + tune_kernel( "mm", kernel_string["mm"], @@ -52,6 +53,7 @@ kernel_args["mm"], tune_params, metrics=metrics, - compiler_options=["-fast", "-acc=gpu"], + answer=answer, compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/directives/matrix_multiply_c_openmp.py b/examples/directives/matrix_multiply_c_openmp.py new file mode 100644 index 000000000..3298ffb7b --- /dev/null +++ b/examples/directives/matrix_multiply_c_openmp.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +"""This is an example tuning a naive matrix multiplication using the simplified 
directives interface""" + +from kernel_tuner import tune_kernel, run_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +N = 4096 + +code = """ +#define N 4096 + +void matrix_multiply(float *A, float *B, float *C) { + #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) + float temp_sum = 0.0f; + #pragma omp target + #pragma omp teams distribute collapse(2) + for ( int i = 0; i < N; i++) { + for ( int j = 0; j < N; j++ ) { + temp_sum = 0.0f; + #pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum) + for ( int k = 0; k < N; k++ ) { + temp_sum += A[(i * N) + k] * B[(k * N) + j]; + } + C[(i * N) + j] = temp_sum; + } + } + #pragma tuner stop +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +dims = {"NN": N**2} +kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims) + +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["time_s"] = lambda x: x["time"] / 10**3 +metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 +metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 + +# compute reference solution from CPU +results = run_kernel( + "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"] +) +answer = [None, None, results[2]] + +tune_kernel( + "mm", + kernel_string["mm"], + 0, + kernel_args["mm"], + tune_params, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/vector_add_c_openacc.py b/examples/directives/vector_add_c_openacc.py index c062f2b1e..4730f3fd5 100644 --- a/examples/directives/vector_add_c_openacc.py +++ b/examples/directives/vector_add_c_openacc.py @@ -67,6 +67,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvc++", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py new file mode 100644 index 000000000..e212532cf --- /dev/null +++ b/examples/directives/vector_add_c_openmp.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenMP code with the kernel tuner""" + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives + +code = """ +#include + +#define VECTOR_SIZE 1000000 + +int main(void) { + int size = VECTOR_SIZE; + float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); +} +""" + +# Extract tunable directive +app = Code(OpenMP(), Cxx()) +kernel_string, kernel_args = process_directives(app, code) + +tune_params = dict() +tune_params["nteams"] = [2**i for i in range(1,11)] +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = ( + lambda x: ((2 * 4 * len(kernel_args["vector_add"][0])) + (4 * len(kernel_args["vector_add"][0]))) + / (x["time"] / 10**3) + / 10**9 +) + +answer = [None, None, kernel_args["vector_add"][0] + 
kernel_args["vector_add"][1], None] + +tune_kernel( + "vector_add", + kernel_string["vector_add"], + 0, + kernel_args["vector_add"], + tune_params, + metrics=metrics, + answer=answer, + compiler="nvc++", + compiler_options=["-fast", "-mp=gpu"], +) diff --git a/examples/directives/vector_add_fortran_openacc.py b/examples/directives/vector_add_fortran_openacc.py index 29e94646a..503a4538e 100644 --- a/examples/directives/vector_add_fortran_openacc.py +++ b/examples/directives/vector_add_fortran_openacc.py @@ -62,6 +62,6 @@ tune_params, metrics=metrics, answer=answer, - compiler_options=["-fast", "-acc=gpu"], compiler="nvfortran", + compiler_options=["-fast", "-acc=gpu"], ) diff --git a/examples/fortran/test_fortran_vector_add.py b/examples/fortran/test_fortran_vector_add.py deleted file mode 100755 index 415647024..000000000 --- a/examples/fortran/test_fortran_vector_add.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import json -from pathlib import Path -import numpy as np -from kernel_tuner import run_kernel - - -def test(): - filename = Path(__file__).parent / "vector_add.F90" - with open(filename, "r") as f: - kernel_string = f.read() - - size = 10000000 - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = size - tune_params["NTHREADS"] = 4 - - answer = run_kernel( - "vector_add", - kernel_string, - size, - args, - tune_params, - lang="fortran", - compiler="gfortran", - ) - - assert np.allclose(answer[0], a + b, atol=1e-8) - - -if __name__ == "__main__": - test() diff --git a/examples/fortran/vector_add.F90 b/examples/fortran/vector_add.F90 index b5541b59c..6d39752b5 100644 --- a/examples/fortran/vector_add.F90 +++ b/examples/fortran/vector_add.F90 @@ -17,11 +17,11 @@ subroutine vector_add(C, A, B, n) real (c_float), intent(in), dimension(N) :: A, B integer (c_int), intent(in) :: n - !$OMP parallel do + !$omp parallel do do i = 1, N C(i) = A(i) + B(i) end do - !$OMP end parallel do + !$omp end parallel do end subroutine vector_add diff --git a/examples/fortran/vector_add.py b/examples/fortran/vector_add.py index 5c1a5476b..813f6671e 100755 --- a/examples/fortran/vector_add.py +++ b/examples/fortran/vector_add.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """This is a minimal example for calling Fortran functions""" -from __future__ import print_function import numpy as np from kernel_tuner import tune_kernel @@ -29,6 +28,7 @@ def tune(): tune_params, lang="C", compiler="gfortran", + compiler_options=["-fopenmp"], ) return result diff --git a/examples/fortran/vector_add_acc.F90 b/examples/fortran/vector_add_acc.F90 deleted file mode 100644 index 9188e74cc..000000000 --- a/examples/fortran/vector_add_acc.F90 +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef N -#define N 1024 -#endif -#ifndef block_size_x -#define block_size_x 256 -#endif - -module vector -use iso_c_binding -use omp_lib - -contains - -subroutine vector_add(C, A, B, n) - use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - - !$acc data copyin(A, B) copyout(C) - - !$acc parallel loop device_type(nvidia) vector_length(block_size_x) - do i = 1, n - C(i) = A(i) + B(i) - end do - !$acc end parallel loop - - !$acc end data - -end subroutine vector_add - - - -function time_vector_add(C, A, B, n) result(time) - 
use iso_c_binding - real (c_float), intent(out), dimension(N) :: C - real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), value, intent(in) :: n - real (c_float) :: time - real (c_double) start_time, end_time - - start_time = omp_get_wtime() - - call vector_add(C, A, B, n) - - end_time = omp_get_wtime() - time = (end_time - start_time)*1e3 - -end function time_vector_add - - - -end module vector diff --git a/examples/fortran/vector_add_acc.py b/examples/fortran/vector_add_acc.py deleted file mode 100755 index ff84c8b6e..000000000 --- a/examples/fortran/vector_add_acc.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -"""This is a minimal example for calling Fortran functions""" - -import numpy as np -from kernel_tuner import tune_kernel - - -def tune(): - size = int(72 * 1024 * 1024) - - a = np.random.randn(size).astype(np.float32) - b = np.random.randn(size).astype(np.float32) - c = np.zeros_like(b) - n = np.int32(size) - - args = [c, a, b, n] - - tune_params = dict() - tune_params["N"] = [size] - tune_params["block_size_x"] = [32, 64, 128, 256, 512] - - result, env = tune_kernel( - "time_vector_add", - "vector_add_acc.F90", - size, - args, - tune_params, - lang="C", - compiler="nvfortran", - compiler_options=["-fast", "-acc=gpu"], - ) - - return result - - -if __name__ == "__main__": - tune() diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index a37c9d6e7..c458789e1 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -57,6 +57,12 @@ def memcpy_htod(self, dest, src): """This method must implement a host to device copy.""" pass + def refresh_memory(self, arguments, should_sync): + """Copy the original content of the output memory to device memory.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_htod(self.allocations[i], arg) + class GPUBackend(Backend): """Base class for GPU backends""" @@ -87,3 +93,8 @@ class CompilerBackend(Backend): @abstractmethod def __init__(self, iterations, compiler_options, compiler): pass + + @abstractmethod + def cleanup_lib(self): + """Unload the previously loaded shared library""" + pass diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 154f501ba..76028fb94 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -82,8 +82,12 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + # allocations contains a clean copy of the memory + self.allocations = [] self.observers = observers or [] self.observers.append(CompilerRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) self.iterations = iterations self.max_threads = 1024 @@ -93,7 +97,6 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers self.lib = None self.using_openmp = False self.using_openacc = False - self.observers = [CompilerRuntimeObserver(self)] self.last_result = None if self.compiler == "g++": @@ -150,11 +153,6 @@ def ready_argument_list(self, arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference - # to its underlying array, so we need to store a reference to arg.copy() - # in the Argument object manually to avoid it being deleted. - # (This changed in numpy > 1.15.) 
- # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) else: raise TypeError("unknown dtype for ndarray") @@ -163,6 +161,7 @@ def ready_argument_list(self, arguments): elif is_cupy_array(arg): data_ctypes = C.c_void_p(arg.data.ptr) ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) + self.allocations.append(Argument(numpy=arg.copy(), ctypes=data_ctypes)) return ctype_args def compile(self, kernel_instance): @@ -186,16 +185,13 @@ def compile(self, kernel_instance): compiler_options = ["-fPIC"] # detect openmp - if "#include " in kernel_string or "use omp_lib" in kernel_string: + if "#pragma omp" in kernel_string or "!$omp" in kernel_string: logging.debug("set using_openmp to true") self.using_openmp = True - if self.compiler in ["nvc", "nvc++", "nvfortran"]: - compiler_options.append("-mp") - else: - compiler_options.append("-fopenmp") # detect openacc if "#pragma acc" in kernel_string or "!$acc" in kernel_string: + logging.debug("set using_openacc to true") self.using_openacc = True # if filename is known, use that one @@ -372,7 +368,8 @@ def memset(self, allocation, value, size): C.memset(allocation.ctypes, value, size) def memcpy_dtoh(self, dest, src): - """a simple memcpy copying from an Argument to a numpy array + """This method implements the semantic of a device to host copy for the Compiler backend. + There is no actual copy from device to host happening, but host to host. :param dest: A numpy or cupy array to store the data :type dest: np.ndarray or cupy.ndarray @@ -380,6 +377,7 @@ def memcpy_dtoh(self, dest, src): :param src: An Argument for some memory allocation :type src: Argument """ + # there is no real copy from device to host, but host to host if isinstance(dest, np.ndarray) and is_cupy_array(src.numpy): # Implicit conversion to a NumPy array is not allowed. value = src.numpy.get() @@ -389,27 +387,20 @@ def memcpy_dtoh(self, dest, src): dest[:] = xp.asarray(value) def memcpy_htod(self, dest, src): - """a simple memcpy copying from a numpy array to an Argument - - :param dest: An Argument for some memory allocation - :type dest: Argument + """There is no memcpy_htod implemented for the compiler backend.""" + pass - :param src: A numpy or cupy array containing the source data - :type src: np.ndarray or cupy.ndarray - """ - if isinstance(dest.numpy, np.ndarray) and is_cupy_array(src): - # Implicit conversion to a NumPy array is not allowed. 
- value = src.get() - else: - value = src - xp = get_array_module(dest.numpy) - dest.numpy[:] = xp.asarray(value) + def refresh_memory(self, arguments, should_sync): + """Copy the preserved content of the output memory to used arrays.""" + for i, arg in enumerate(arguments): + if should_sync[i]: + self.memcpy_dtoh(arg, self.allocations[i]) def cleanup_lib(self): """unload the previously loaded shared library""" if self.lib is None: return - + if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 4323c411c..2f51b9b5b 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -79,9 +79,7 @@ def __init__(self, kernel_name, kernel_sources, lang, defines=None): self.defines = defines if lang is None: if callable(self.kernel_sources[0]): - raise TypeError( - "Please specify language when using a code generator function" - ) + raise TypeError("Please specify language when using a code generator function") kernel_string = self.get_kernel_string(0) lang = util.detect_language(kernel_string) @@ -108,9 +106,7 @@ def get_kernel_string(self, index=0, params=None): kernel_source = self.kernel_sources[index] return util.get_kernel_string(kernel_source, params) - def prepare_list_of_files( - self, kernel_name, params, grid, threads, block_size_names - ): + def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_names): """prepare the kernel string along with any additional files The first file in the list is allowed to include or read in the others @@ -146,9 +142,7 @@ def prepare_list_of_files( for i, f in enumerate(self.kernel_sources): if i > 0 and not util.looks_like_a_filename(f): - raise ValueError( - "When passing multiple kernel sources, the secondary entries must be filenames" - ) + raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") ks = self.get_kernel_string(i, params) # add preprocessor statements @@ -182,9 +176,7 @@ def prepare_list_of_files( def get_user_suffix(self, index=0): """Get the suffix of the kernel filename, if the user specified one. Return None otherwise.""" - if util.looks_like_a_filename(self.kernel_sources[index]) and ( - "." in self.kernel_sources[index] - ): + if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): return "." + self.kernel_sources[index].split(".")[-1] return None @@ -213,13 +205,9 @@ def check_argument_lists(self, kernel_name, arguments): """ for i, f in enumerate(self.kernel_sources): if not callable(f): - util.check_argument_list( - kernel_name, self.get_kernel_string(i), arguments - ) + util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) else: - logging.debug( - "Checking of arguments list not supported yet for code generators." 
- ) + logging.debug("Checking of arguments list not supported yet for code generators.") class DeviceInterface(object): @@ -303,6 +291,7 @@ def __init__( compiler=compiler, compiler_options=compiler_options, iterations=iterations, + observers=observers, ) elif lang.upper() == "HIP": dev = HipFunctions( @@ -312,7 +301,9 @@ def __init__( observers=observers, ) else: - raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet") + raise ValueError( + "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" + ) self.dev = dev # look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra @@ -453,9 +444,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett obs.results = result duration = max(duration, obs.continuous_duration) - self.benchmark_continuous( - func, gpu_args, instance.threads, instance.grid, result, duration - ) + self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration) except Exception as e: # some launches may fail because too many registers are required @@ -468,9 +457,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett "INVALID_WORK_GROUP_SIZE", ] if any([skip_str in str(e) for skip_str in skippable_exceptions]): - logging.debug( - "benchmark fails due to runtime failure too many resources required" - ) + logging.debug("benchmark fails due to runtime failure too many resources required") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" @@ -482,13 +469,11 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett raise e return result - def check_kernel_output( - self, func, gpu_args, instance, answer, atol, verify, verbose - ): + def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose): """runs the kernel once and checks the result against answer""" logging.debug("check_kernel_output") - #if not using custom verify function, check if the length is the same + # if not using custom verify function, check if the length is the same if answer: if len(instance.arguments) != len(answer): raise TypeError("The length of argument list and provided results do not match.") @@ -499,14 +484,13 @@ def check_kernel_output( # re-copy original contents of output arguments to GPU memory, to overwrite any changes # by earlier kernel runs - for i, arg in enumerate(instance.arguments): - if should_sync[i]: - self.dev.memcpy_htod(gpu_args[i], arg) + self.dev.refresh_memory(instance.arguments, should_sync) # run the kernel check = self.run_kernel(func, gpu_args, instance) if not check: - return # runtime failure occured that should be ignored, skip correctness check + # runtime failure occured that should be ignored, skip correctness check + return # retrieve gpu results to host memory result_host = [] @@ -517,7 +501,7 @@ def check_kernel_output( self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well + # if the answer is on the host, copy gpu output to host as well result_host.append(torch.zeros_like(answer[i])) self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) else: @@ -545,10 +529,7 @@ def check_kernel_output( correct = True if not correct: - 
raise RuntimeError( - "Kernel result verification failed for: " - + util.get_config_string(instance.params) - ) + raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers @@ -562,7 +543,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Compile and benchmark a kernel instance based on kernel strings and parameters instance_string = util.get_instance_string(params) - logging.debug('compile_and_benchmark ' + instance_string) + logging.debug("compile_and_benchmark " + instance_string) instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): @@ -580,9 +561,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, else: # add shared memory arguments to compiled module if kernel_options.smem_args is not None: - self.dev.copy_shared_memory_args( - util.get_smem_args(kernel_options.smem_args, params) - ) + self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) # add constant memory arguments to compiled module if kernel_options.cmem_args is not None: self.dev.copy_constant_memory_args(kernel_options.cmem_args) @@ -596,12 +575,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # test kernel for correctness if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() - self.check_kernel_output( - func, gpu_args, instance, to.answer, to.atol, to.verify, verbose - ) - last_verification_time = 1000 * ( - time.perf_counter() - start_verification - ) + self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) + last_verification_time = 1000 * (time.perf_counter() - start_verification) # benchmark if func: @@ -617,10 +592,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, except Exception as e: # dump kernel sources to temp file temp_filenames = instance.prepare_temp_files_for_error_msg() - print( - "Error while compiling or benchmarking, see source files: " - + " ".join(temp_filenames) - ) + print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) raise e # clean up any temporary files, if no error occured @@ -666,7 +638,7 @@ def compile_kernel(self, instance, verbose): @staticmethod def preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" return _preprocess_gpu_arguments(old_arguments, params) def copy_shared_memory_args(self, smem_args): @@ -702,9 +674,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) ) if np.prod(threads) > self.dev.max_threads: if verbose: - print( - f"skipping config {util.get_instance_string(params)} reason: too many threads per block" - ) + print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block") return util.InvalidConfig() # obtain the kernel_string and prepare additional files, if any @@ -723,7 +693,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) # Preprocess GPU arguments. 
Require for handling `Tunable` arguments arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) - #collect everything we know about this instance and return it + # collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): @@ -770,12 +740,8 @@ def run_kernel(self, func, gpu_args, instance): try: self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid) except Exception as e: - if "too many resources requested for launch" in str( - e - ) or "OUT_OF_RESOURCES" in str(e): - logging.debug( - "ignoring runtime failure due to too many resources required" - ) + if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e): + logging.debug("ignoring runtime failure due to too many resources required") return False else: logging.debug("encountered unexpected runtime failure: " + str(e)) @@ -784,7 +750,7 @@ def run_kernel(self, func, gpu_args, instance): def _preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" new_arguments = [] for argument in old_arguments: @@ -801,15 +767,11 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): # first check if the length is the same if len(instance.arguments) != len(answer): - raise TypeError( - "The length of argument list and provided results do not match." - ) + raise TypeError("The length of argument list and provided results do not match.") # for each element in the argument list, check if the types match for i, arg in enumerate(instance.arguments): if answer[i] is not None: # skip None elements in the answer list - if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance( - arg, (np.ndarray, cp.ndarray) - ): + if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)): if answer[i].dtype != arg.dtype: raise TypeError( f"Element {i} of the expected results list is not of the same dtype as the kernel output: " @@ -857,16 +819,14 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): ) else: # either answer[i] and argument have different types or answer[i] is not a numpy type - if not isinstance( - answer[i], (np.ndarray, cp.ndarray, torch.Tensor) - ) or not isinstance(answer[i], np.number): + if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance( + answer[i], np.number + ): raise TypeError( f"Element {i} of expected results list is not a numpy/cupy ndarray, torch Tensor or numpy scalar." ) else: - raise TypeError( - f"Element {i} of expected results list and kernel arguments have different types." 
- ) + raise TypeError(f"Element {i} of expected results list and kernel arguments have different types.") def _ravel(a): if hasattr(a, "ravel") and len(a.shape) > 1: @@ -886,26 +846,15 @@ def _flatten(a): expected = _flatten(expected) if any([isinstance(array, cp.ndarray) for array in [expected, result]]): output_test = cp.allclose(expected, result, atol=atol) - elif isinstance(expected, torch.Tensor) and isinstance( - result, torch.Tensor - ): + elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor): output_test = torch.allclose(expected, result, atol=atol) else: output_test = np.allclose(expected, result, atol=atol) if not output_test and verbose: - print( - "Error: " - + util.get_config_string(instance.params) - + " detected during correctness check" - ) - print( - "this error occured when checking value of the %oth kernel argument" - % (i,) - ) - print( - "Printing kernel output and expected result, set verbose=False to suppress this debug print" - ) + print("Error: " + util.get_config_string(instance.params) + " detected during correctness check") + print("this error occured when checking value of the %oth kernel argument" % (i,)) + print("Printing kernel output and expected result, set verbose=False to suppress this debug print") np.set_printoptions(edgeitems=50) print("Kernel output:") print(result) @@ -940,11 +889,7 @@ def apply_template_typenames(type_list, templated_typenames): def replace_typename_token(matchobj): """function for a whitespace preserving token regex replace""" # replace only the match, leaving the whitespace around it as is - return ( - matchobj.group(1) - + templated_typenames[matchobj.group(2)] - + matchobj.group(3) - ) + return matchobj.group(1) + templated_typenames[matchobj.group(2)] + matchobj.group(3) for i, arg_type in enumerate(type_list): for k, v in templated_typenames.items(): @@ -975,9 +920,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): # relatively strict regex that does not allow nested template parameters like vector # within the template parameter list regex = ( - r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" - + name - + r"\s*\((.*?)\)\s*\{" + r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" 
+ name + r"\s*\((.*?)\)\s*\{" ) match = re.search(regex, kernel_string, re.S) if not match: @@ -985,15 +928,12 @@ def wrap_templated_kernel(kernel_string, kernel_name): template_parameters = match.group(1).split(",") argument_list = match.group(3).split(",") - argument_list = [ - s.strip() for s in argument_list - ] # remove extra whitespace around 'type name' strings + # remove extra whitespace around 'type name' strings + argument_list = [s.strip() for s in argument_list] type_list, name_list = split_argument_list(argument_list) - templated_typenames = get_templated_typenames( - template_parameters, template_arguments - ) + templated_typenames = get_templated_typenames(template_parameters, template_arguments) apply_template_typenames(type_list, templated_typenames) # replace __global__ with __device__ in the templated kernel definition @@ -1007,9 +947,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): launch_bounds = match.group(2) # generate code for the compile-time template instantiation - template_instantiation = ( - f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" - ) + template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" # generate code for the wrapper kernel new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)]) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ded8c57fe..36cd219b0 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -56,6 +56,13 @@ def get(self) -> str: return "openacc" +class OpenMP(Directive): + """Class to represent OpenMP""" + + def get(self) -> str: + return "openmp" + + class Cxx(Language): """Class to represent C++ code""" @@ -131,6 +138,11 @@ def is_openacc(directive: Directive) -> bool: return isinstance(directive, OpenACC) +def is_openmp(directive: Directive) -> bool: + """Check if a directive is OpenMP""" + return isinstance(directive, OpenMP) + + def is_cxx(lang: Language) -> bool: """Check if language is C++""" return isinstance(lang, Cxx) @@ -141,6 +153,19 @@ def is_fortran(lang: Language) -> bool: return isinstance(lang, Fortran) +def line_contains(line: str, target: str) -> bool: + """Generic helper to check if a line contains the target""" + return target in line + + +def directive_contains_clause(line: str, clauses: list) -> bool: + """Check if a directive contains one clause from a list""" + for clause in clauses: + if clause in line: + return True + return False + + def line_contains_openacc_directive(line: str, lang: Language) -> bool: """Check if line contains an OpenACC directive or not""" if is_cxx(lang): @@ -160,6 +185,25 @@ def line_contains_openacc_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc") +def line_contains_openmp_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP directive or not""" + if is_cxx(lang): + return line_contains_openmp_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_directive_fortran(line) + return False + + +def line_contains_openmp_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP directive or not""" + return line_contains(line, "#pragma omp") + + +def line_contains_openmp_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP directive or not""" + return line_contains(line, "!$omp") + + def line_contains_openacc_parallel_directive(line: str, lang: Language) -> bool: """Check if 
line contains an OpenACC parallel directive or not""" if is_cxx(lang): @@ -179,23 +223,35 @@ def line_contains_openacc_parallel_directive_fortran(line: str) -> bool: return line_contains(line, "!$acc parallel") -def line_contains(line: str, target: str) -> bool: - """Generic helper to check if a line contains the target""" - return target in line +def line_contains_openmp_target_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenMP target directive or not""" + if is_cxx(lang): + return line_contains_openmp_target_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openmp_target_directive_fortran(line) + return False -def openacc_directive_contains_clause(line: str, clauses: list) -> bool: - """Check if an OpenACC directive contains one clause from a list""" - for clause in clauses: - if clause in line: - return True - return False +def line_contains_openmp_target_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenMP target directive or not""" + return line_contains(line, "#pragma omp target") + + +def line_contains_openmp_target_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenMP target directive or not""" + return line_contains(line, "!$omp target") def openacc_directive_contains_data_clause(line: str) -> bool: """Check if an OpenACC directive contains one data clause""" data_clauses = ["copy", "copyin", "copyout", "create", "no_create", "present", "device_ptr", "attach"] - return openacc_directive_contains_clause(line, data_clauses) + return directive_contains_clause(line, data_clauses) + + +def openmp_directive_contains_data_clause(line: str) -> bool: + """Check if an OpenMP directive contains one data clause""" + data_clauses = ["map"] + return directive_contains_clause(line, data_clauses) def create_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: @@ -223,6 +279,29 @@ def create_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: ) +def create_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create a data directive for a given language""" + if is_cxx(lang): + return create_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return create_data_directive_openmp_fortran(name, size) + return "" + + +def create_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to allocate and copy data""" + return f"#pragma omp target enter data map(to: {name}[:{size.get()}])\n" + + +def create_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to allocate and copy data""" + if len(size) == 1: + return f"!$omp target enter data map(to: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target enter data map(to: {name}({','.join(md_size)}))\n" + + def exit_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: """Create code to copy data back for a given language""" if is_cxx(lang): @@ -246,6 +325,29 @@ def exit_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: return f"!$acc exit data copyout({name}({','.join(md_size)}))\n" +def exit_data_directive_openmp(name: str, size: ArraySize, lang: Language) -> str: + """Create code to copy data back for a given language""" + if is_cxx(lang): + return exit_data_directive_openmp_cxx(name, size) + elif is_fortran(lang): + return exit_data_directive_openmp_fortran(name, size) + return "" + + +def 
exit_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to copy back data""" + return f"#pragma omp target exit data map(from: {name}[:{size.get()}])\n" + + +def exit_data_directive_openmp_fortran(name: str, size: ArraySize) -> str: + """Create Fortran OpenMP code to copy back data""" + if len(size) == 1: + return f"!$omp target exit data map(from: {name}(:{size.get()}))\n" + else: + md_size = fortran_md_size(size) + return f"!$omp target exit data map(from: {name}({','.join(md_size)}))\n" + + def correct_kernel(kernel_name: str, line: str) -> bool: """Checks if the line contains the correct kernel name""" return f" {kernel_name} " in line or (kernel_name in line and len(line.partition(kernel_name)[2]) == 0) @@ -379,15 +481,38 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - if is_openacc(langs.directive) and is_cxx(langs.language): - intro += create_data_directive_openacc_cxx(name, size) - outro += exit_data_directive_openacc_cxx(name, size) - elif is_openacc(langs.directive) and is_fortran(langs.language): - intro += create_data_directive_openacc_fortran(name, size) - outro += exit_data_directive_openacc_fortran(name, size) + temp = [] + if is_openacc(langs.directive): + temp = wrap_data_openacc(name, size, langs) + elif is_openmp(langs.directive): + temp = wrap_data_openmp(name, size, langs) + intro += temp[0] + outro += temp[1] return "\n".join([intro, code, outro]) +def wrap_data_openacc(name: str, size: int, langs: Code) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro = create_data_directive_openacc_cxx(name, size) + outro = exit_data_directive_openacc_cxx(name, size) + elif is_fortran(langs.language): + intro = create_data_directive_openacc_fortran(name, size) + outro = exit_data_directive_openacc_fortran(name, size) + return intro, outro + + +def wrap_data_openmp(name: str, size: int, langs: Code) -> Tuple[str, str]: + """Create language specific data directives""" + if is_cxx(langs.language): + intro = create_data_directive_openmp_cxx(name, size) + outro = exit_data_directive_openmp_cxx(name, size) + elif is_fortran(langs.language): + intro = create_data_directive_openmp_fortran(name, size) + outro = exit_data_directive_openmp_fortran(name, size) + return intro, outro + + def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" if is_cxx(langs.language): @@ -472,7 +597,7 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) p_type = param[1:-1] p_type = p_type.split(":")[0] if "*" in p_type: - p_type = p_type.replace("*", " * restrict") + p_type = p_type.replace("*", " *") if is_cxx(langs.language): params.append(f"{p_type} {p_name}") elif is_fortran(langs.language): diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 913fee85d..9f37c1ee5 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -146,12 +146,6 @@ def test_byte_array_arguments(): assert all(output_arg1 == arg1) - dest = np.zeros_like(arg1) - - cfunc.memcpy_dtoh(dest, output[0]) - - assert all(dest == arg1) - @patch("kernel_tuner.backends.compiler.subprocess") @patch("kernel_tuner.backends.compiler.numpy.ctypeslib") @@ -159,9 +153,7 @@ def test_compile(npct, subprocess): 
kernel_string = "this is a fake C program" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) @@ -191,9 +183,7 @@ def test_compile_detects_device_code(npct, subprocess): kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; }" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() cfunc.compile(kernel_instance) @@ -230,111 +220,6 @@ def test_memset(): assert all(x == np.zeros(4)) -@skip_if_no_cupy -def test_memcpy_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros(len(x), dtype=x.dtype) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output == a) - assert all(x.get() == a) - - -@skip_if_no_gcc -def test_memcpy_host_dtoh(): - a = [1, 2, 3, 4] - x = np.array(a).astype(np.float32) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - output = np.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(a) - print(output) - - assert all(output == a) - assert all(x == a) - - -@skip_if_no_cupy -def test_memcpy_device_dtoh(): - import cupy as cp - - a = [1, 2, 3, 4] - x = cp.asarray(a, dtype=np.float32) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - output = cp.zeros_like(x) - - cfunc = CompilerFunctions() - cfunc.memcpy_dtoh(output, arg) - - print(f"{type(x)=} {x=}") - print(f"{type(a)=} {a=}") - print(f"{type(output)=} {output=}") - - assert all(output.get() == a) - assert all(x.get() == a) - - -@skip_if_no_cupy -def test_memcpy_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = np.array(a, dtype=np.float32) - x = cp.zeros(len(src), dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - -def test_memcpy_host_htod(): - a = [1, 2, 3, 4] - src = np.array(a).astype(np.float32) - x = np.zeros_like(src) - x_c = x.ctypes.data_as(C.POINTER(C.c_float)) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy == a) - - -@skip_if_no_cupy -def test_memcpy_device_htod(): - import cupy as cp - - a = [1, 2, 3, 4] - src = cp.array(a, dtype=np.float32) - x = cp.zeros(len(src), dtype=src.dtype) - x_c = C.c_void_p(x.data.ptr) - arg = Argument(numpy=x, ctypes=x_c) - - cfunc = CompilerFunctions() - cfunc.memcpy_htod(arg, src) - - assert all(arg.numpy.get() == a) - - @skip_if_no_gfortran def test_complies_fortran_function_no_module(): kernel_string = """ @@ -347,9 +232,7 @@ def test_complies_fortran_function_no_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = 
KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -378,9 +261,7 @@ def test_complies_fortran_function_with_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) try: cfunc = CompilerFunctions(compiler="gfortran") @@ -427,7 +308,7 @@ def env(): @skip_if_no_openmp @skip_if_no_gcc def test_benchmark(env): - results, _ = kernel_tuner.tune_kernel(*env, block_size_names=["nthreads"]) + results, _ = kernel_tuner.tune_kernel(*env, compiler_options=["-fopenmp"], block_size_names=["nthreads"]) assert len(results) == 3 assert all(["nthreads" in result for result in results]) assert all(["time" in result for result in results]) @@ -487,3 +368,26 @@ def test_run_kernel(): lang="C", ) assert cp.all((a + b) == c) + + +def test_refresh_memory(): + arg1 = np.array([1, 2, 3]).astype(np.int32) + arguments = [arg1] + cfunc = CompilerFunctions() + _ = cfunc.ready_argument_list(arguments) + assert np.all(arguments[0] == [1, 2, 3]) + arguments[0] = np.array([0, 0, 0]).astype(np.int8) + assert np.all(arguments[0] == [0, 0, 0]) + cfunc.refresh_memory(arguments, [True]) + assert np.all(arguments[0] == [1, 2, 3]) + + +def test_memcpy_dtoh(): + arg1 = np.array([0, 5, 0, 7]).astype(np.int32) + arguments = [arg1] + cfunc = CompilerFunctions() + ready_arguments = cfunc.ready_argument_list(arguments) + output = np.array([5, 0, 7, 0]).astype(np.int32) + assert np.all(ready_arguments[0].numpy != output) + cfunc.memcpy_dtoh(output, ready_arguments[0]) + assert np.all(ready_arguments[0].numpy == output) diff --git a/test/test_core.py b/test/test_core.py index a8624470e..39597b86c 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -107,11 +107,12 @@ def test_check_kernel_output(dev_func_interface): dev.check_kernel_output('func', answer, instance, answer, atol, None, True) - dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0]) + dfi.refresh_memory.assert_called() dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1)) print(dfi.mock_calls) + assert dfi.refresh_memory.called == 1 assert dfi.memcpy_dtoh.called == 1 for name, args, _ in dfi.mock_calls: diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index d58a44421..759a20a54 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -4,6 +4,13 @@ def test_is_openacc(): assert is_openacc(OpenACC()) assert not is_openacc(None) + assert not is_openacc(OpenMP()) + + +def test_is_openmp(): + assert is_openmp(OpenMP()) + assert not is_openmp(None) + assert not is_openmp(OpenACC()) def test_is_cxx(): @@ -28,6 +35,16 @@ def test_line_contains_openacc_directive(): assert not line_contains_openacc_directive(cxx_code, None) +def test_line_contains_openmp_directive(): + cxx_code = "int main(void) {\n#pragma omp target}" + f90_code = "!$omp target" + assert line_contains_openmp_directive(cxx_code, Cxx()) + assert not line_contains_openmp_directive(f90_code, Cxx()) + assert line_contains_openmp_directive(f90_code, Fortran()) + assert not line_contains_openmp_directive(cxx_code, Fortran()) + assert 
not line_contains_openmp_directive(cxx_code, None) + + def test_line_contains_openacc_parallel_directive(): assert line_contains_openacc_parallel_directive("#pragma acc parallel wait", Cxx()) assert line_contains_openacc_parallel_directive("!$acc parallel", Fortran()) @@ -36,11 +53,24 @@ def test_line_contains_openacc_parallel_directive(): assert not line_contains_openacc_parallel_directive("!$acc parallel", None) +def test_line_contains_openmp_target_directive(): + assert line_contains_openmp_target_directive("#pragma omp target teams", Cxx()) + assert line_contains_openmp_target_directive("!$omp target", Fortran()) + assert not line_contains_openmp_target_directive("#pragma omp loop", Cxx()) + assert not line_contains_openmp_target_directive("!$omp loop", Fortran()) + assert not line_contains_openmp_target_directive("!$omp parallel", None) + + def test_openacc_directive_contains_data_clause(): assert openacc_directive_contains_data_clause("#pragma acc parallel present(A[:1089])") assert not openacc_directive_contains_data_clause("#pragma acc parallel for") +def test_openmp_directive_contains_data_clause(): + assert openmp_directive_contains_data_clause("#pragma omp target teams map(tofrom: A[:1089])") + assert not openmp_directive_contains_data_clause("#pragma omp target") + + def test_create_data_directive(): size = ArraySize() size.add(1024) @@ -48,6 +78,7 @@ def test_create_data_directive(): create_data_directive_openacc("array", size, Cxx()) == "#pragma acc enter data create(array[:1024])\n#pragma acc update device(array[:1024])\n" ) + assert create_data_directive_openmp("array", size, Cxx()) == "#pragma omp target enter data map(to: array[:1024])\n" size.clear() size.add(35) size.add(16) @@ -55,18 +86,27 @@ def test_create_data_directive(): create_data_directive_openacc("matrix", size, Fortran()) == "!$acc enter data create(matrix(:35,:16))\n!$acc update device(matrix(:35,:16))\n" ) + assert ( + create_data_directive_openmp("matrix", size, Fortran()) == "!$omp target enter data map(to: matrix(:35,:16))\n" + ) assert create_data_directive_openacc("array", size, None) == "" + assert create_data_directive_openmp("array", size, None) == "" def test_exit_data_directive(): size = ArraySize() size.add(1024) assert exit_data_directive_openacc("array", size, Cxx()) == "#pragma acc exit data copyout(array[:1024])\n" + assert exit_data_directive_openmp("array", size, Cxx()) == "#pragma omp target exit data map(from: array[:1024])\n" size.clear() size.add(35) size.add(16) assert exit_data_directive_openacc("matrix", size, Fortran()) == "!$acc exit data copyout(matrix(:35,:16))\n" + assert ( + exit_data_directive_openmp("matrix", size, Fortran()) == "!$omp target exit data map(from: matrix(:35,:16))\n" + ) assert exit_data_directive_openacc("matrix", size, None) == "" + assert exit_data_directive_openmp("matrix", size, None) == "" def test_correct_kernel(): @@ -101,14 +141,20 @@ def test_wrap_timing(): def test_wrap_data(): acc_cxx = Code(OpenACC(), Cxx()) acc_f90 = Code(OpenACC(), Fortran()) + omp_cxx = Code(OpenMP(), Cxx()) + omp_f90 = Code(OpenMP(), Fortran()) code_cxx = "// this is a comment\n" code_f90 = "! 
this is a comment\n" data = {"array": ["int*", "size"]} preprocessor = ["#define size 42"] expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n" assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx + expected_cxx = "#pragma omp target enter data map(to: array[:42])\n\n// this is a comment\n\n#pragma omp target exit data map(from: array[:42])\n" + assert wrap_data(code_cxx, omp_cxx, data, preprocessor, None) == expected_cxx expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n" assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 + expected_f90 = "!$omp target enter data map(to: array(:42))\n\n! this is a comment\n\n!$omp target exit data map(from: array(:42))\n" + assert wrap_data(code_f90, omp_f90, data, preprocessor, None) == expected_f90 data = {"matrix": ["float*", "rows,cols"]} preprocessor = ["#define rows 42", "#define cols 84"] expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n" @@ -238,13 +284,13 @@ def test_extract_directive_signature(): signatures = extract_directive_signature(code, acc_cxx) assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add") assert len(signatures) == 1 assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + "float vector_add(float * a, float * b, float * c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, acc_cxx, "vector_add_ext")
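
A minimal usage sketch of the relocated host-memory refresh introduced above: ready_argument_list in the Compiler backend now keeps a clean copy of every NumPy argument in cfunc.allocations, and refresh_memory restores those copies instead of relying on memcpy_htod. The snippet mirrors the new test_refresh_memory and assumes a working host compiler is available for CompilerFunctions(); it is an illustration, not an addition to the patch itself.

import numpy as np
from kernel_tuner.backends.compiler import CompilerFunctions

# ready_argument_list stores a clean copy of each NumPy argument in cfunc.allocations
arg = np.array([1, 2, 3], dtype=np.int32)
arguments = [arg]
cfunc = CompilerFunctions()
cfunc.ready_argument_list(arguments)

# simulate a kernel run overwriting the output buffer
arguments[0][:] = 0

# refresh_memory copies the preserved contents back into the host arrays
cfunc.refresh_memory(arguments, should_sync=[True])
assert np.all(arguments[0] == [1, 2, 3])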
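
Likewise, a small sketch of the new OpenMP helpers in kernel_tuner.utils.directives, following the expectations encoded in test_create_data_directive and test_wrap_data; the ArraySize usage and the exact directive strings are taken from those tests, and the printed comments paraphrase their asserts.

from kernel_tuner.utils.directives import (
    ArraySize, Code, Cxx, OpenMP,
    create_data_directive_openmp, exit_data_directive_openmp, wrap_data,
)

# C++ OpenMP enter/exit data directives for a 1024-element array
size = ArraySize()
size.add(1024)
print(create_data_directive_openmp("array", size, Cxx()))  # #pragma omp target enter data map(to: array[:1024])
print(exit_data_directive_openmp("array", size, Cxx()))    # #pragma omp target exit data map(from: array[:1024])

# wrap_data dispatches on the (directive, language) pair stored in a Code object,
# resolving the array size ("size" -> 42) through the given preprocessor definitions
omp_cxx = Code(OpenMP(), Cxx())
wrapped = wrap_data("// kernel body\n", omp_cxx, {"array": ["int*", "size"]}, ["#define size 42"], None)
print(wrapped)  # enter data directive, kernel body, exit data directive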