From de9046468d89e29fe48f0e7b0844d2cfe6575168 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 27 Jun 2023 19:19:31 +0200 Subject: [PATCH] Improve `pairwise_distance` workloads * Use 2D dispatch in kernel impls instead of a huge sequential inner loop. * Use nested prange in `numba_mlir_p` impl, `numba` and `numba-dpex` don't support nested pranges, but `numba-mlir` does. * Remove workaround in `numba_mlir_n` --- .../pairwise_distance_numba_dpex_k.py | 16 ++++++++-------- .../pairwise_distance_numba_mlir_k.py | 18 ++++++++++-------- .../pairwise_distance_numba_mlir_n.py | 3 +-- .../pairwise_distance_numba_mlir_p.py | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py index 9ca9c01b..057b8298 100644 --- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py +++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py @@ -9,16 +9,16 @@ @dpex.kernel def _pairwise_distance_kernel(X1, X2, D): i = dpex.get_global_id(0) + j = dpex.get_global_id(1) - X2_rows = X2.shape[0] X1_cols = X1.shape[1] - for j in range(X2_rows): - d = X1.dtype.type(0.0) - for k in range(X1_cols): - tmp = X1[i, k] - X2[j, k] - d += tmp * tmp - D[i, j] = np.sqrt(d) + + d = X1.dtype.type(0.0) + for k in range(X1_cols): + tmp = X1[i, k] - X2[j, k] + d += tmp * tmp + D[i, j] = np.sqrt(d) def pairwise_distance(X1, X2, D): - _pairwise_distance_kernel[X1.shape[0],](X1, X2, D) + _pairwise_distance_kernel[dpex.Range(X1.shape[0], X2.shape[0])](X1, X2, D) diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py index 1b6bd37f..d2e68272 100644 --- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py +++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_k.py @@ -9,16 +9,18 @@ 
@nb.kernel(gpu_fp64_truncate="auto") def _pairwise_distance_kernel(X1, X2, D): i = nb.get_global_id(0) + j = nb.get_global_id(1) - X2_rows = X2.shape[0] X1_cols = X1.shape[1] - for j in range(X2_rows): - d = 0.0 - for k in range(X1_cols): - tmp = X1[i, k] - X2[j, k] - d += tmp * tmp - D[i, j] = np.sqrt(d) + + d = 0.0 + for k in range(X1_cols): + tmp = X1[i, k] - X2[j, k] + d += tmp * tmp + D[i, j] = np.sqrt(d) def pairwise_distance(X1, X2, D): - _pairwise_distance_kernel[X1.shape[0], nb.DEFAULT_LOCAL_SIZE](X1, X2, D) + _pairwise_distance_kernel[ + (X1.shape[0], X2.shape[0]), nb.DEFAULT_LOCAL_SIZE + ](X1, X2, D) diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py index 00c9c982..263ecc91 100644 --- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py +++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_n.py @@ -11,8 +11,7 @@ def _pairwise_distance(X1, X2, D): x1 = np.sum(np.square(X1), axis=1) x2 = np.sum(np.square(X2), axis=1) np.dot(X1, X2.T, D) - # D *= -2 TODO: inplace ops doesn't work as intended - D[:] = D * -2 + D *= -2 x3 = x1.reshape(x1.size, 1) np.add(D, x3, D) np.add(D, x2, D) diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py index cafb63c6..145f7b9c 100644 --- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py +++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_mlir_p.py @@ -25,7 +25,7 @@ def _pairwise_distance(X1, X2, D): # Outermost parallel loop over the matrix X1 for i in numba.prange(X1_rows): # Loop over the matrix X2 - for j in range(X2_rows): + for j in numba.prange(X2_rows): d = 0.0 # Compute exclidean distance for k in range(X1_cols):