From 1820e49e8513e9376f9bfa79fd8bc9afed64e8ef Mon Sep 17 00:00:00 2001 From: roxx30198 Date: Mon, 30 Oct 2023 14:41:01 -0600 Subject: [PATCH] Gaussian elimination impl --- dpbench/benchmarks/rodinia/CMakeLists.txt | 2 + .../rodinia/gaussian/CMakeLists.txt | 5 + .../benchmarks/rodinia/gaussian/__init__.py | 28 ++++ .../rodinia/gaussian/gaussian_initialize.py | 34 +++++ .../rodinia/gaussian/gaussian_numba_dpex_k.py | 107 ++++++++++++++ .../rodinia/gaussian/gaussian_python.py | 24 ++++ .../gaussian_sycl_native_ext/CMakeLists.txt | 14 ++ .../gaussian_sycl_native_ext/__init__.py | 7 + .../gaussian_sycl/_gaussian_kernel.hpp | 57 ++++++++ .../gaussian_sycl/_gaussian_sycl.cpp | 131 ++++++++++++++++++ .../configs/bench_info/rodinia/gaussian.toml | 60 ++++++++ 11 files changed, 469 insertions(+) create mode 100644 dpbench/benchmarks/rodinia/gaussian/CMakeLists.txt create mode 100644 dpbench/benchmarks/rodinia/gaussian/__init__.py create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_initialize.py create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_numba_dpex_k.py create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_python.py create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/CMakeLists.txt create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/__init__.py create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_kernel.hpp create mode 100644 dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_sycl.cpp create mode 100644 dpbench/configs/bench_info/rodinia/gaussian.toml diff --git a/dpbench/benchmarks/rodinia/CMakeLists.txt b/dpbench/benchmarks/rodinia/CMakeLists.txt index 5985a8ff..eac8f1cf 100644 --- a/dpbench/benchmarks/rodinia/CMakeLists.txt +++ b/dpbench/benchmarks/rodinia/CMakeLists.txt @@ -1,3 +1,5 @@ # SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 + +add_subdirectory(gaussian) diff --git a/dpbench/benchmarks/rodinia/gaussian/CMakeLists.txt b/dpbench/benchmarks/rodinia/gaussian/CMakeLists.txt new file mode 100644 index 00000000..d60c99ed --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +add_subdirectory(gaussian_sycl_native_ext) diff --git a/dpbench/benchmarks/rodinia/gaussian/__init__.py b/dpbench/benchmarks/rodinia/gaussian/__init__.py new file mode 100644 index 00000000..80c92bf6 --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/__init__.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +""" + +Gaussian elimination implementation + +This is sycl and numba-dpex implementation for gaussian elimination + +Input +--------- +size : Forms an input matrix of dimensions (size x size) + +Output + +-------- + +result> : Result of the given set of linear equations using + gaussian elimination. + +Method: + +The gaussian transformations are applied to the input matrix to form the +diagonal matrix in forward elimination, and then the equations are solved +to find the result in back substitution. + +""" diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_initialize.py b/dpbench/benchmarks/rodinia/gaussian/gaussian_initialize.py new file mode 100644 index 00000000..21821337 --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_initialize.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +LAMBDA = -0.01 + + +def initialize(size, types_dict): + import math + + import numpy as np + + dtype = types_dict["float"] + + coe = np.empty((2 * size - 1), dtype=dtype) + a = np.empty((size * size), dtype=dtype) + + for i in range(size): + coe_i = 10 * math.exp(LAMBDA * i) + j = size - 1 + i + coe[j] = coe_i + j = size - 1 - i + coe[j] = coe_i + + for i in range(size): + for j in range(size): + a[i * size + j] = coe[size - 1 - i + j] + + return ( + a, + np.ones(size, dtype=dtype), + np.zeros((size * size), dtype=dtype), + np.zeros(size, dtype=dtype), + ) diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_numba_dpex_k.py b/dpbench/benchmarks/rodinia/gaussian/gaussian_numba_dpex_k.py new file mode 100644 index 00000000..dc7b89d1 --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_numba_dpex_k.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import numba_dpex + + +@numba_dpex.kernel() +def gaussian_kernel_1(m, a, size, t): + if ( + numba_dpex.get_local_id(2) + + numba_dpex.get_group_id(2) * numba_dpex.get_local_size(2) + >= size - 1 - t + ): + return + + m[ + size + * ( + numba_dpex.get_local_size(2) * numba_dpex.get_group_id(2) + + numba_dpex.get_local_id(2) + + t + + 1 + ) + + t + ] = ( + a[ + size + * ( + numba_dpex.get_local_size(2) * numba_dpex.get_group_id(2) + + numba_dpex.get_local_id(2) + + t + + 1 + ) + + t + ] + / a[size * t + t] + ) + + +@numba_dpex.kernel() +def gaussian_kernel_2(m, a, b, size, t): + if ( + numba_dpex.get_local_id(2) + + numba_dpex.get_group_id(2) * numba_dpex.get_local_size(2) + >= size - 1 - t + ): + return + + if ( + numba_dpex.get_local_id(1) + + numba_dpex.get_group_id(1) * numba_dpex.get_local_size(1) + >= size - t + ): + return + + xidx = numba_dpex.get_group_id(2) * numba_dpex.get_local_size( + 2 + ) + numba_dpex.get_local_id(2) + yidx = numba_dpex.get_group_id(1) * numba_dpex.get_local_size( + 1 + ) + numba_dpex.get_local_id(1) + + a[size * (xidx + 1 + t) + (yidx + t)] -= ( + m[size * (xidx + 1 + t) + t] * a[size * t + (yidx + t)] + ) + if yidx == 0: + b[xidx + 1 + t] -= m[size * (xidx + 1 + t) + (yidx + t)] * b[t] + + +def gaussian(a, b, m, size, block_sizeXY, result): + device = dpctl.SyclDevice() + block_size = device.max_work_group_size + grid_size = int((size / block_size) + 0 if not (size % block_size) else 1) + + blocksize2d = block_sizeXY + gridsize2d = int( + (size / blocksize2d) + (0 if not (size % blocksize2d) else 1) + ) + + global_range = numba_dpex.Range(1, 1, grid_size * block_size) + local_range = numba_dpex.Range(1, 1, block_size) + + dim_blockXY = numba_dpex.Range(1, blocksize2d, blocksize2d) + dim_gridXY = numba_dpex.Range( + 1, gridsize2d * blocksize2d, gridsize2d * blocksize2d + ) + + for t in range(size - 1): + gaussian_kernel_1[numba_dpex.NdRange(global_range, local_range)]( + m, a, size, t + ) + + gaussian_kernel_2[numba_dpex.NdRange(dim_gridXY, dim_blockXY)]( + m, a, b, size, t + ) + + for i in range(size): + result[size - i - 1] = b[size - i - 1] + for j in range(i): + result[size - i - 1] -= ( + a[size * (size - i - 1) + (size - j - 1)] * result[size - j - 1] + ) + result[size - i - 1] = ( + result[size - i - 1] / a[size * (size - i - 1) + (size - i - 1)] + ) diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_python.py b/dpbench/benchmarks/rodinia/gaussian/gaussian_python.py new file mode 100644 index 00000000..5280399e --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_python.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +def gaussian(a, b, m, size, block_sizeXY, result): + # Forward Elimination + for t in range(size - 1): + for i in range(t + 1, size): + m = a[i * size + t] / a[t * size + t] + for j in range(t, size): + a[i * size + j] = a[i * size + j] - m * a[t * size + j] + b[i] = b[i] - m * b[t] + + # Back Substitution + for i in range(size): + result[size - i - 1] = b[size - i - 1] + for j in range(i): + result[size - i - 1] -= ( + a[size * (size - i - 1) + (size - j - 1)] * result[size - j - 1] + ) + result[size - i - 1] = ( + result[size - i - 1] / a[size * (size - i - 1) + (size - i - 1)] + ) diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/CMakeLists.txt new file mode 100644 index 00000000..236f6c45 --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/CMakeLists.txt @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set(module_name gaussian_sycl) +set(py_module_name _${module_name}) +python_add_library(${py_module_name} MODULE ${module_name}/${py_module_name}.cpp) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${module_name}/${py_module_name}.cpp) +target_include_directories(${py_module_name} PRIVATE ${Dpctl_INCLUDE_DIRS}) + +file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) +install(TARGETS ${py_module_name} + DESTINATION ${py_module_dest}/${module_name} +) diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/__init__.py b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/__init__.py new file mode 100644 index 00000000..e99261dc --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from .gaussian_sycl._gaussian_sycl import gaussian as gaussian_sycl + +__all__ = ["gaussian_sycl"] diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_kernel.hpp b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_kernel.hpp new file mode 100644 index 00000000..fd184c62 --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_kernel.hpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +#include + +using namespace sycl; + +template +void gaussian_kernel_1(FpTy *m_device, + const FpTy *a_device, + int size, + int t, + sycl::nd_item<3> item_ct1) +{ + if (item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range().get(2) >= + size - 1 - t) + return; + m_device[size * (item_ct1.get_local_range().get(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2) + t + 1) + + t] = a_device[size * (item_ct1.get_local_range().get(2) * + item_ct1.get_group(2) + + item_ct1.get_local_id(2) + t + 1) + + t] / + a_device[size * t + t]; +} + +template +void gaussian_kernel_2(FpTy *m_device, + FpTy *a_device, + FpTy *b_device, + int size, + int j1, + int t, + sycl::nd_item<3> item_ct1) +{ + if (item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range().get(2) >= + size - 1 - t) + return; + if (item_ct1.get_local_id(1) + + item_ct1.get_group(1) * item_ct1.get_local_range().get(1) >= + size - t) + return; + + int xidx = item_ct1.get_group(2) * item_ct1.get_local_range().get(2) + + item_ct1.get_local_id(2); + int yidx = item_ct1.get_group(1) * item_ct1.get_local_range().get(1) + + item_ct1.get_local_id(1); + + a_device[size * (xidx + 1 + t) + (yidx + t)] -= + m_device[size * (xidx + 1 + t) + t] * a_device[size * t + (yidx + t)]; + if (yidx == 0) { + b_device[xidx + 1 + t] -= + m_device[size * (xidx + 1 + t) + (yidx + t)] * b_device[t]; + } +} diff --git a/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_sycl.cpp b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_sycl.cpp new file mode 100644 index 00000000..73533ddc --- /dev/null +++ b/dpbench/benchmarks/rodinia/gaussian/gaussian_sycl_native_ext/gaussian_sycl/_gaussian_sycl.cpp @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include "_gaussian_kernel.hpp" +#include +#include + +template bool ensure_compatibility(const Args &...args) +{ + std::vector arrays = {args...}; + + auto arr = arrays.at(0); + auto q = arr.get_queue(); + auto type_flag = arr.get_typenum(); + auto arr_size = arr.get_size(); + + for (auto &arr : arrays) { + if (!(arr.get_flags() & (USM_ARRAY_C_CONTIGUOUS))) { + std::cerr << "All arrays need to be C contiguous.\n"; + return false; + } + if (arr.get_typenum() != type_flag) { + std::cerr << "All arrays should be of same elemental type.\n"; + return false; + } + if (arr.get_ndim() > 1) { + std::cerr << "All arrays expected to be single-dimensional.\n"; + return false; + } + } + return true; +} + +void gaussian_sync(dpctl::tensor::usm_ndarray a, + dpctl::tensor::usm_ndarray b, + dpctl::tensor::usm_ndarray m, + int size, + int block_sizeXY, + dpctl::tensor::usm_ndarray result) +{ + if (!ensure_compatibility(a, m, b, result)) + throw std::runtime_error("Input arrays are not acceptable."); + + int t; + + sycl::queue q_ct1; + + int block_size, grid_size; + block_size = q_ct1.get_device() + .get_info(); + grid_size = (size / block_size) + (!(size % block_size) ? 0 : 1); + + sycl::range<3> dimBlock(1, 1, block_size); + sycl::range<3> dimGrid(1, 1, grid_size); + + int blocksize2d, gridsize2d; + blocksize2d = block_sizeXY; + gridsize2d = (size / blocksize2d) + (!(size % blocksize2d ? 0 : 1)); + + sycl::range<3> dimBlockXY(1, blocksize2d, blocksize2d); + sycl::range<3> dimGridXY(1, gridsize2d, gridsize2d); + + auto a_value = a.get_data(); + auto b_value = b.get_data(); + auto m_value = m.get_data(); + + for (t = 0; t < (size - 1); t++) { + /* + DPCT1049:7: The workgroup size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the workgroup size if + needed. + */ + q_ct1.submit([&](sycl::handler &cgh) { + auto size_ct2 = size; + cgh.parallel_for(sycl::nd_range<3>(dimGrid * dimBlock, dimBlock), + [=](sycl::nd_item<3> item_ct1) { + gaussian_kernel_1(m_value, a_value, size_ct2, + t, item_ct1); + }); + }); + q_ct1.wait_and_throw(); + /* + DPCT1049:8: The workgroup size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the workgroup size if + needed. + */ + q_ct1.submit([&](sycl::handler &cgh) { + auto size_ct3 = size; + auto size_t_ct4 = size - t; + + cgh.parallel_for( + sycl::nd_range<3>(dimGridXY * dimBlockXY, dimBlockXY), + [=](sycl::nd_item<3> item_ct1) { + gaussian_kernel_2(m_value, a_value, b_value, size_ct3, + size_t_ct4, t, item_ct1); + }); + }); + q_ct1.wait_and_throw(); + } + // Copying the final answer + auto result_value = result.get_data(); + + for (int i = 0; i < size; i++) { + + result_value[size - i - 1] = b_value[size - i - 1]; + + for (int j = 0; j < i; j++) { + result_value[size - i - 1] -= + *(a_value + size * (size - i - 1) + (size - j - 1)) * + result_value[size - j - 1]; + } + + result_value[size - i - 1] = + result_value[size - i - 1] / + *(a_value + size * (size - i - 1) + (size - i - 1)); + } +} + +PYBIND11_MODULE(_gaussian_sycl, m) +{ + // Import the dpctl extensions + import_dpctl(); + + m.def("gaussian", &gaussian_sync, + "DPC++ implementation of the gaussian elimination", py::arg("a"), + py::arg("b"), py::arg("m"), py::arg("size"), py::arg("block_sizeXY"), + py::arg("result")); +} diff --git a/dpbench/configs/bench_info/rodinia/gaussian.toml b/dpbench/configs/bench_info/rodinia/gaussian.toml new file mode 100644 index 00000000..a91ff80a --- /dev/null +++ b/dpbench/configs/bench_info/rodinia/gaussian.toml @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +[benchmark] +name = "Gaussian Elimination" +short_name = "gaussian" +relative_path = "gaussian" +module_name = "gaussian" +func_name = "gaussian" +kind = "microbenchmark" +domain = "Matrix manipulation" +input_args = [ + "a", + "b", + "m", + "size", + "block_sizeXY", + "result" +] +array_args = [ + "a", + "b", + "m", + "result" +] +output_args = [ + "result", +] + +[benchmark.parameters.S] +size = 10 +block_sizeXY = 4 + +[benchmark.parameters.M16Gb] +size = 4096 +block_sizeXY = 4 + +[benchmark.parameters.M] +size = 4096 +block_sizeXY = 4 + +[benchmark.parameters.L] +size = 8192 +block_sizeXY = 4 + +[benchmark.init] +func_name = "initialize" +types_dict_name="types_dict" +precision="double" +input_args = [ + "size", + "types_dict", +] +output_args = [ + "a", + "b", + "m", + "result" +]