From 82f9dc0591d6eaded1bea92498a791d3c857eb33 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:01:57 +0200 Subject: [PATCH 01/14] Refactoring the function generation using bespoke templates. --- kernel_tuner/utils/directives.py | 81 ++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 7d9bb4e7..ba2408d4 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -2,6 +2,36 @@ from abc import ABC, abstractmethod import numpy as np +# Function templates +acc_cpp_template = """ + + +#include + +extern "C" { + + + +} +""" + +acc_f90_template = """ + + + +module kt +use iso_c_binding +contains + + + + + +end function + +end module kt +""" + class Directive(ABC): """Base class for all directives""" @@ -529,42 +559,35 @@ def generate_directive_function( ) -> str: """Generate tunable function for one directive""" - code = "\n".join(preprocessor) + "\n" - if user_dimensions is not None: - # add user dimensions to preprocessor - for key, value in user_dimensions.items(): - code += f"#define {key} {value}\n" - if is_cxx(langs.language) and "#include " not in preprocessor: - code += "\n#include \n" - if is_cxx(langs.language): - code += 'extern "C" ' + signature + "{\n" - elif is_fortran(langs.language): - code += "\nmodule kt\nuse iso_c_binding\ncontains\n" - code += "\n" + signature - if len(initialization) > 1: - code += initialization + "\n" - if data is not None: - body = add_present_openacc(body, langs, data, preprocessor, user_dimensions) if is_cxx(langs.language): + code = acc_cpp_template body = start_timing_cxx(body) if data is not None: - code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) - else: - code += body - code = end_timing_cxx(code) - if len(deinitialization) > 1: - code += deinitialization + "\n" - code += "\n}" + body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) + body += end_timing_cxx(body) elif is_fortran(langs.language): + code = acc_f90_template body = wrap_timing(body, langs.language) if data is not None: - code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) - else: - code += body + "\n" - if len(deinitialization) > 1: - code += deinitialization + "\n" + body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) name = signature.split(" ")[1].split("(")[0] - code += f"\nend function {name}\nend module kt\n" + code = code.replace("", name) + code = code.replace("", preprocessor) + # if present, add user specific dimensions as defines + if user_dimensions is not None: + user_defines = "" + for key, value in user_dimensions.items(): + user_defines += f"#define {key} {value}\n" + code = code.replace("", user_defines) + else: + code = code.replace("", "") + code = code.replace("", signature) + if len(initialization) > 1: + code = code.replace("", initialization) + if len(deinitialization) > 1: + code = code.replace("", deinitialization) + if data is not None: + body = add_present_openacc(body, langs, data, preprocessor, user_dimensions) return code From bd64ad5de38c2de6c5a546c9abab9407b92884bc Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:28:09 +0200 Subject: [PATCH 02/14] Bug fix. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ba2408d4..a006626d 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -572,7 +572,7 @@ def generate_directive_function( body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) name = signature.split(" ")[1].split("(")[0] code = code.replace("", name) - code = code.replace("", preprocessor) + code = code.replace("", "\n".join(preprocessor)) # if present, add user specific dimensions as defines if user_dimensions is not None: user_defines = "" From 6fd2ca987e3ad8f800c5d1857af85413ef7db450 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:30:49 +0200 Subject: [PATCH 03/14] Multiple fixes for the function generation. --- kernel_tuner/utils/directives.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index a006626d..fe031cc5 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -582,12 +582,11 @@ def generate_directive_function( else: code = code.replace("", "") code = code.replace("", signature) - if len(initialization) > 1: - code = code.replace("", initialization) - if len(deinitialization) > 1: - code = code.replace("", deinitialization) + code = code.replace("", initialization) + code = code.replace("", deinitialization) if data is not None: body = add_present_openacc(body, langs, data, preprocessor, user_dimensions) + code = code.replace("", body) return code From 33ae6b03b3f577f51d265a0622eb9d1d1da3ed3e Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:33:01 +0200 Subject: [PATCH 04/14] Multiple typos fixed. --- kernel_tuner/utils/directives.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index fe031cc5..24403f31 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -10,7 +10,7 @@ extern "C" { - + } """ @@ -25,7 +25,7 @@ - + end function From 297890299d11769abc857e19e67387158749dcba Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:38:03 +0200 Subject: [PATCH 05/14] Last bug fixed for C++ code, example now working. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 24403f31..ff0573af 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -564,7 +564,7 @@ def generate_directive_function( body = start_timing_cxx(body) if data is not None: body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) - body += end_timing_cxx(body) + body = end_timing_cxx(body) elif is_fortran(langs.language): code = acc_f90_template body = wrap_timing(body, langs.language) From b135d7b66fa6fa5e3883f3d5f09e3fd9d5e58c2d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 22 Jul 2024 15:43:57 +0200 Subject: [PATCH 06/14] Fixed typo for the Fortran generation. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ff0573af..3f32800a 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -571,7 +571,7 @@ def generate_directive_function( if data is not None: body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) name = signature.split(" ")[1].split("(")[0] - code = code.replace("", name) + code = code.replace("", name) code = code.replace("", "\n".join(preprocessor)) # if present, add user specific dimensions as defines if user_dimensions is not None: From b5ce41c91d90d3a35231eac420aa3ca07c2166d7 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 1 Aug 2024 13:32:47 +0200 Subject: [PATCH 07/14] Small refactoring change. --- kernel_tuner/utils/directives.py | 4 ++-- test/utils/test_directives.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 3f32800a..064cd95f 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -369,7 +369,7 @@ def wrap_timing_fortran(code: str) -> str: def end_timing_cxx(code: str) -> str: """In C++ we need to return the measured time""" - return code + "\nreturn elapsed_time.count();\n" + return "\n".join([code, "return elapsed_time.count();\n"]) def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: @@ -385,7 +385,7 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use elif is_openacc(langs.directive) and is_fortran(langs.language): intro += create_data_directive_openacc_fortran(name, size) outro += exit_data_directive_openacc_fortran(name, size) - return intro + code + outro + return "\n".join([intro, code, outro]) def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index bed2d871..d58a4442 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -105,13 +105,13 @@ def test_wrap_data(): code_f90 = "! this is a comment\n" data = {"array": ["int*", "size"]} preprocessor = ["#define size 42"] - expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n// this is a comment\n#pragma acc exit data copyout(array[:42])\n" + expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n" assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx - expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n! this is a comment\n!$acc exit data copyout(array(:42))\n" + expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n" assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 data = {"matrix": ["float*", "rows,cols"]} preprocessor = ["#define rows 42", "#define cols 84"] - expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n! this is a comment\n!$acc exit data copyout(matrix(:42,:84))\n" + expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n" assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 dimensions = {"rows": 42, "cols": 84} assert wrap_data(code_f90, acc_f90, data, user_dimensions=dimensions) == expected_f90 From c83b4ce98ff37b9828df415a88a4a6c20cd5276b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 6 Aug 2024 16:03:15 +0200 Subject: [PATCH 08/14] Adding a helper functions to facilitate users. --- kernel_tuner/utils/directives.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 064cd95f..1d7afb64 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -684,3 +684,19 @@ def add_present_openacc_fortran(name: str, size: ArraySize) -> str: else: md_size = fortran_md_size(size) return f" present({name}({','.join(md_size)})) " + + +def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> dict: + """Helper functions to process all the directives in the code and create tunable functions""" + kernel_strings = dict() + preprocessor = extract_preprocessor(source) + signatures = extract_directive_signature(source, langs) + bodies = extract_directive_code(source, langs) + data = extract_directive_data(source, langs) + init = extract_initialization_code(source, langs) + deinit = extract_deinitialization_code(source, langs) + for kernel in signatures.keys(): + kernel_strings[kernel] = generate_directive_function( + preprocessor, signatures[kernel], bodies[kernel], langs, data[kernel], init, deinit, user_dimensions + ) + return kernel_strings From d2ec24a0585f4fa1b50adf41abbd028618193579 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 6 Aug 2024 16:40:06 +0200 Subject: [PATCH 09/14] Improved the helper, it can also allocate data now. --- kernel_tuner/utils/directives.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 1d7afb64..b019765b 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -686,9 +686,10 @@ def add_present_openacc_fortran(name: str, size: ArraySize) -> str: return f" present({name}({','.join(md_size)})) " -def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> dict: +def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> tuple(dict, dict): """Helper functions to process all the directives in the code and create tunable functions""" kernel_strings = dict() + kernel_args = dict() preprocessor = extract_preprocessor(source) signatures = extract_directive_signature(source, langs) bodies = extract_directive_code(source, langs) @@ -699,4 +700,5 @@ def process_directives(langs: Code, source: str, user_dimensions: dict = None) - kernel_strings[kernel] = generate_directive_function( preprocessor, signatures[kernel], bodies[kernel], langs, data[kernel], init, deinit, user_dimensions ) - return kernel_strings + kernel_args[kernel] = allocate_signature_memory(data[kernel], preprocessor, user_dimensions) + return (kernel_strings, kernel_args) From b8d4e820a8a145374df7c87a6c4580ada98db50d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 6 Aug 2024 16:46:31 +0200 Subject: [PATCH 10/14] Fixed type hint. --- kernel_tuner/utils/directives.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index b019765b..b9612abb 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Tuple from abc import ABC, abstractmethod import numpy as np @@ -686,7 +686,7 @@ def add_present_openacc_fortran(name: str, size: ArraySize) -> str: return f" present({name}({','.join(md_size)})) " -def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> tuple(dict, dict): +def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> Tuple[dict, dict]: """Helper functions to process all the directives in the code and create tunable functions""" kernel_strings = dict() kernel_args = dict() From bb1f701211bbe45aa4c7f543a26ad18100091055 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 7 Aug 2024 15:21:22 +0200 Subject: [PATCH 11/14] First draft of the matrix multiply example. --- .../directives/matrix_multiply_c_openacc.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 examples/directives/matrix_multiply_c_openacc.py diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py new file mode 100644 index 00000000..9e0c5523 --- /dev/null +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +"""This is an example tuning a naive matrix multiplication using the simplified directives interface""" + +from kernel_tuner import tune_kernel +from kernel_tuner.utils.directives import ( + Code, + OpenACC, + Cxx, + process_directives +) + +code = """ +#include + +#define N 4096 + +void matrix_multiply(float *A, float *B, float *C) { + #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) + #pragma acc parallel vector_length(nthreads) + #pragma acc loop + for ( i = 0; i < N; i++) { + for ( j = 0; j < N; j++ ) { + for ( k = 0; k < N; k++ ) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } + #pragma tuner stop +} +""" + +# Extract tunable directive +app = Code(OpenACC(), Cxx()) +dims = {"NN": 4096*4096} +kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims) + +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = lambda x: (4096 * 4096 * 4) / (x["time"] / 10**3) / 10**9 + +tune_kernel( + "mm", + kernel_string["mm"], + 0, + kernel_args["mm"], + tune_params, + metrics=metrics, + compiler_options=["-fast", "-acc=gpu"], + compiler="nvc++", +) From 7e5e6b2fd13a74678feb9d66092dce2e35f787ac Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 7 Aug 2024 15:43:32 +0200 Subject: [PATCH 12/14] Fixing code. --- .../directives/matrix_multiply_c_openacc.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index 9e0c5523..ae9efad6 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -10,19 +10,20 @@ ) code = """ -#include - #define N 4096 void matrix_multiply(float *A, float *B, float *C) { #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) + float temp_sum = 0.0f; #pragma acc parallel vector_length(nthreads) - #pragma acc loop - for ( i = 0; i < N; i++) { - for ( j = 0; j < N; j++ ) { - for ( k = 0; k < N; k++ ) { - C[i][j] += A[i][k] * B[k][j]; + #pragma acc loop collapse(2) reduction(+:temp_sum) + for ( int i = 0; i < N; i++) { + for ( int j = 0; j < N; j++ ) { + temp_sum = 0.0f; + for ( int k = 0; k < N; k++ ) { + temp_sum += A[(i * N) + k] * B[(k * N) + j]; } + C[(i * N) + j] = temp_sum; } } #pragma tuner stop @@ -37,7 +38,7 @@ tune_params = dict() tune_params["nthreads"] = [32 * i for i in range(1, 33)] metrics = dict() -metrics["GB/s"] = lambda x: (4096 * 4096 * 4) / (x["time"] / 10**3) / 10**9 +metrics["GB/s"] = lambda x: ((4096 * 4096 * 4096 * 2 * 4) + (4096 * 4096 * 4)) / (x["time"] / 10**3) / 10**9 tune_kernel( "mm", From 73c86094eebd3d07a96932c57a24369b24908426 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 7 Aug 2024 16:09:55 +0200 Subject: [PATCH 13/14] Some code changes, and more metrics. --- examples/directives/matrix_multiply_c_openacc.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/directives/matrix_multiply_c_openacc.py b/examples/directives/matrix_multiply_c_openacc.py index ae9efad6..d8bf7cc4 100644 --- a/examples/directives/matrix_multiply_c_openacc.py +++ b/examples/directives/matrix_multiply_c_openacc.py @@ -9,6 +9,8 @@ process_directives ) +N = 4096 + code = """ #define N 4096 @@ -16,10 +18,11 @@ #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN) float temp_sum = 0.0f; #pragma acc parallel vector_length(nthreads) - #pragma acc loop collapse(2) reduction(+:temp_sum) + #pragma acc loop gang collapse(2) for ( int i = 0; i < N; i++) { for ( int j = 0; j < N; j++ ) { temp_sum = 0.0f; + #pragma acc loop vector reduction(+:temp_sum) for ( int k = 0; k < N; k++ ) { temp_sum += A[(i * N) + k] * B[(k * N) + j]; } @@ -32,13 +35,15 @@ # Extract tunable directive app = Code(OpenACC(), Cxx()) -dims = {"NN": 4096*4096} +dims = {"NN": N**2} kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims) tune_params = dict() tune_params["nthreads"] = [32 * i for i in range(1, 33)] metrics = dict() -metrics["GB/s"] = lambda x: ((4096 * 4096 * 4096 * 2 * 4) + (4096 * 4096 * 4)) / (x["time"] / 10**3) / 10**9 +metrics["time_s"] = lambda x: x["time"] / 10**3 +metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9 +metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9 tune_kernel( "mm", From a9509976c776c67d88fab0a1512cedbf28b5b309 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 20 Aug 2024 13:58:41 +0200 Subject: [PATCH 14/14] Rename two variables. --- kernel_tuner/utils/directives.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index b9612abb..ded8c57f 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -3,7 +3,7 @@ import numpy as np # Function templates -acc_cpp_template = """ +cpp_template: str = """ #include @@ -15,7 +15,7 @@ } """ -acc_f90_template = """ +f90_template: str = """ @@ -560,13 +560,13 @@ def generate_directive_function( """Generate tunable function for one directive""" if is_cxx(langs.language): - code = acc_cpp_template + code = cpp_template body = start_timing_cxx(body) if data is not None: body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) body = end_timing_cxx(body) elif is_fortran(langs.language): - code = acc_f90_template + code = f90_template body = wrap_timing(body, langs.language) if data is not None: body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)