Running pathfinder with rodinia

IntelPython · Oct 11, 2023 · a1f7f26 · a1f7f26
1 parent eec58be
commit a1f7f26
Show file tree

Hide file tree

Showing 17 changed files with 501 additions and 2 deletions.
diff --git a/dpbench/benchmarks/CMakeLists.txt b/dpbench/benchmarks/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(kmeans)
 add_subdirectory(knn)
 add_subdirectory(gpairs)
 add_subdirectory(dbscan)
+add_subdirectory(rodinia)
 
 # generate dpcpp version into config
 set(FILE ${CMAKE_SOURCE_DIR}/dpbench/configs/framework_info/dpcpp.toml)

diff --git a/dpbench/benchmarks/rodinia/CMakeLists.txt b/dpbench/benchmarks/rodinia/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Descend into the pathfinder benchmark so its build rules are processed.
+add_subdirectory(pathfinder)
diff --git a/dpbench/benchmarks/rodinia/pathfinder/CMakeLists.txt b/dpbench/benchmarks/rodinia/pathfinder/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Descend into the SYCL native extension directory of the pathfinder benchmark.
+add_subdirectory(pathfinder_sycl_native_ext)
diff --git a/dpbench/benchmarks/rodinia/pathfinder/__init__.py b/dpbench/benchmarks/rodinia/pathfinder/__init__.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+
+Pathfinder Computation
+
+This algorithm finds the shortest path from the first row to the last.
+
+Input
+---------
+rows<int_64> : Number of rows in the grid
+
+cols<int_64> : Number of columns in the grid
+
+pyramid_height<int_64> : Pyramid height
+
+Output
+--------
+result<array<int_64>> : Minimum distance from the first row to the last
+
+Method:
+
+The elements are fed to the kernel row by row. For each element, the minimum
+distance is computed from the minimum weight of its neighbors in the row above.
+This is repeated for every row until the last one, and the result is returned.
+
+
+"""
diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_initialize.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_initialize.py
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+LOW = 0
+HIGH = 10.0
+SEED = 9
+
+
+def initialize(rows, cols, pyramid_height, types_dict=None):
+    import numpy as np
+    import numpy.random as rnd
+
+    rnd.seed(SEED)
+
+    data, result = rnd.randint(
+        LOW, HIGH, (rows * cols), dtype=np.int64
+    ), np.empty(cols, dtype=np.int64)
+
+    return (data, rows, cols, pyramid_height, result)
diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_numba_dpex_k.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_numba_dpex_k.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp
+import numba_dpex
+
+
+def MIN(a, b):
+    return (a) if (a) <= (b) else (b)
+
+
+@numba_dpex.func
+def min_dpex(a, b):
+    t = a if a <= b else b
+    return t
+
+
+@numba_dpex.kernel(debug=True)
+def _pathfinder_kernel(device_src, device_dest, cols):
+    current_element = numba_dpex.get_global_id(0)
+
+    left_ind = current_element - 1 if current_element >= 1 else 0
+    right_ind = current_element + 1 if current_element < cols - 1 else cols - 1
+    up_ind = current_element
+
+    left = device_src[left_ind]
+    up = device_src[up_ind]
+    right = device_src[right_ind]
+    shortest = min_dpex(left, up)
+    shortest = min_dpex(shortest, right)
+
+    device_dest[current_element] += shortest
+
+
+def pathfinder(data, rows, cols, pyramid_height, result):
+    # create a temp list that hold first row of data as first element and empty numpy array as second element
+    device_dest = dpnp.array(data[:cols], dtype=dpnp.int64)  # first row
+    device_src = dpnp.array([0] * cols, dtype=dpnp.int64)
+
+    t = 1
+
+    while True:
+        if t >= rows:
+            break
+        device_src = device_dest
+        device_dest = dpnp.array(
+            data[t * cols : (t + 1) * cols], dtype=dpnp.int64
+        )
+        _pathfinder_kernel[numba_dpex.Range(cols)](
+            device_src, device_dest, cols
+        )
+        t += 1
+
+    for i in range(cols):
+        result[i] = device_dest[i]
diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_python.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_python.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp
+
+
+def min_dpex(a, b):
+    return a if a <= b else b
+
+
+def _pathfinder_kernel(device_src, device_dest, cols, current_element):
+    left_ind = current_element - 1 if current_element >= 1 else 0
+    right_ind = current_element + 1 if current_element < cols - 1 else cols - 1
+    up_ind = current_element
+
+    left = device_src[left_ind]
+    up = device_src[up_ind]
+    right = device_src[right_ind]
+    shortest = min_dpex(left, up)
+    shortest = min_dpex(shortest, right)
+
+    device_dest[current_element] += shortest
+
+
+def pathfinder(data, rows, cols, pyramid_height, result):
+    # create a temp list that hold first row of data as first element and empty numpy array as second element
+    device_dest = dpnp.array(data[:cols], dtype=dpnp.int64)  # first row
+    device_src = dpnp.array([0] * cols, dtype=dpnp.int64)
+
+    t = 1
+    while True:
+        if t >= rows:
+            break
+        device_src = device_dest
+        device_dest = dpnp.array(
+            data[t * cols : (t + 1) * cols], dtype=dpnp.int64
+        )
+
+        for i in range(cols):
+            _pathfinder_kernel(device_src, device_dest, cols, i)
+        t += 1
+
+    for i in range(cols):
+        result[i] = device_dest[i]
diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/CMakeLists.txt
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# The target / extension module is named "_<module_name>" and its single
+# source file lives in the "<module_name>/" subdirectory.
+set(module_name pathfinder_sycl)
+set(py_module_name _${module_name})
+set(py_module_source ${module_name}/${py_module_name}.cpp)
+
+python_add_library(${py_module_name} MODULE ${py_module_source})
+add_sycl_to_target(
+  TARGET ${py_module_name}
+  SOURCES ${py_module_source}
+)
+target_include_directories(${py_module_name}
+  PRIVATE ${Dpctl_INCLUDE_DIRS}
+)
+
+# Install under the same path this directory has relative to the source root.
+file(RELATIVE_PATH py_module_dest
+  ${CMAKE_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+)
+install(
+  TARGETS ${py_module_name}
+  DESTINATION ${py_module_dest}/${module_name}
+)
diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/__init__.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Re-export the native module's ``pathfinder`` function as ``pathfinder_sycl``.
+from .pathfinder_sycl._pathfinder_sycl import pathfinder as pathfinder_sycl
+
+__all__ = ["pathfinder_sycl"]
diff --git a/...arks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_kernel.hpp b/...arks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_kernel.hpp
@@ -0,0 +1,97 @@
+// SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <CL/sycl.hpp>
+
+using namespace sycl;
+
+// Width of the tile handled by one work-group in pathfinder_impl.
+#define BLOCK_SIZE 128
+// NOTE(review): STR_SIZE and DEVICE are not referenced in the visible part of
+// this header - confirm whether other translation units rely on them.
+#define STR_SIZE 128
+#define DEVICE 0
+// Number of border columns lost on each side of a tile per iteration.
+#define HALO 1
+
+// True when min <= x <= max (both bounds inclusive).
+#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
+// Clamp the lvalue x into [min, max] in place. Every argument is parenthesized
+// so that compound expressions expand with the intended precedence.
+#define CLAMP_RANGE(x, min, max) (x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x))
+// Smaller of a and b; a is chosen when they compare equal.
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+
+// Advance the pathfinder computation by `iteration` rows for one tile.
+//
+// Each work-item loads one element of gpuSrc into prev, then repeatedly
+// combines the minimum of its left/up/right neighbors in prev with the
+// matching gpuWall entry. After the last step, the work-items that computed
+// a value write it to gpuResults.
+//
+// iteration  - number of rows to advance in this call
+// gpuWall    - weights, indexed as cols * row + column
+// gpuSrc     - accumulated costs of the row preceding startStep
+// gpuResults - receives the accumulated costs after the last step
+// cols       - number of columns
+// rows       - not used by this function
+// startStep  - row offset into gpuWall for the first step
+// border     - columns subtracted from the tile's left edge
+// item_ct1   - work-item handle; dimension 2 supplies group and local ids
+// prev       - scratch buffer indexed by local id, inputs of the current step
+// result     - scratch buffer indexed by local id, outputs of the current step
+//
+// NOTE(review): prev and result are presumably work-group local buffers of at
+// least BLOCK_SIZE ints - confirm against the launching code.
+void pathfinder_impl(int iteration,
+                     int *gpuWall,
+                     int *gpuSrc,
+                     int *gpuResults,
+                     int cols,
+                     int rows,
+                     int startStep,
+                     int border,
+                     sycl::nd_item<3> item_ct1,
+                     int *prev,
+                     int *result)
+{
+
+    int bx = item_ct1.get_group(2);
+    int tx = item_ct1.get_local_id(2);
+
+    // each block finally computes result for a small block
+    // after N iterations.
+    // it is the non-overlapping small blocks that cover
+    // all the input data
+
+    // calculate the small block size
+    int small_block_cols = BLOCK_SIZE - iteration * HALO * 2;
+
+    // calculate the boundary for the block according to
+    // the boundary of its small block
+    int blkX = small_block_cols * bx - border;
+    int blkXmax = blkX + BLOCK_SIZE - 1;
+
+    // calculate the global thread coordination
+    int xidx = blkX + tx;
+
+    // effective range within this block that falls within
+    // the valid range of the input data
+    // used to rule out computation outside the boundary.
+    int validXmin = (blkX < 0) ? -blkX : 0;
+    int validXmax = (blkXmax > cols - 1) ? BLOCK_SIZE - 1 - (blkXmax - cols + 1)
+                                         : BLOCK_SIZE - 1;
+
+    // neighbor slots in prev, clamped to the valid part of the tile
+    int W = tx - 1;
+    int E = tx + 1;
+
+    W = (W < validXmin) ? validXmin : W;
+    E = (E > validXmax) ? validXmax : E;
+
+    bool isValid = IN_RANGE(tx, validXmin, validXmax);
+
+    if (IN_RANGE(xidx, 0, cols - 1)) {
+        prev[tx] = gpuSrc[xidx];
+    }
+    item_ct1.barrier(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012
+
+    // Start as false so the final write-back is well defined even when the
+    // loop below runs zero times (iteration <= 0).
+    bool computed = false;
+    for (int i = 0; i < iteration; i++) {
+        computed = false;
+        // the usable range shrinks by one column per side on every step
+        if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && isValid) {
+            computed = true;
+            int left = prev[W];
+            int up = prev[tx];
+            int right = prev[E];
+            int shortest = MIN(left, up);
+            shortest = MIN(shortest, right);
+            int index = cols * (startStep + i) + xidx;
+            result[tx] = shortest + gpuWall[index];
+        }
+        item_ct1.barrier();
+        if (i == iteration - 1)
+            break;
+        if (computed) // Assign the computation range
+            prev[tx] = result[tx];
+        item_ct1
+            .barrier(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012
+    }
+
+    // update the global memory
+    // after the last iteration, only threads coordinated within the
+    // small block perform the calculation and switch on ``computed''
+    if (computed) {
+        gpuResults[xidx] = result[tx];
+    }
+}