Merge branch 'IntelPython:main' into main

adarshyoga · Apr 4, 2024 · c186af9 · c186af9
2 parents 61b6a45 + bece9a4
commit c186af9
Show file tree

Hide file tree

Showing 17 changed files with 69 additions and 53 deletions.
diff --git a/.github/workflows/build_and_run.yml b/.github/workflows/build_and_run.yml
@@ -9,7 +9,7 @@ on:
 
 env:
   # sycl is not included. Add it manually if you need
-  WORKLOADS: python,numpy,dpnp,numba_n,numba_np,numba_npr,numba_dpex_k,numba_dpex_n,numba_dpex_p,numba_mlir_k,numba_mlir_n,numba_mlir_p
+  WORKLOADS: python,numpy,dpnp,numba_n,numba_np,numba_npr,numba_dpex_k,numba_dpex_n,numba_dpex_p
   PYTHONIOENCODING: 'utf-8'
 
 jobs:
@@ -68,7 +68,6 @@ jobs:
         shell: bash -l {0}
         run: |
           find ./environments -type f | xargs sed -i 's/intel::numpy/numpy/'
-          find ./environments -type f | xargs sed -i '/numba-mlir/d'
           find ./environments -type f | xargs sed -i 's/setuptools>=42,<64/setuptools/'
 
       - name: Setup miniconda
@@ -168,8 +167,9 @@ jobs:
       - name: Run benchmarks
         run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results --precision=${{matrix.precision}} || exit 1
 
-      - name: Run rodinia benchmarks
-        run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench --precision=${{matrix.precision}} || exit 1
+      # TODO: Re-enable rodinia benchmarks in CI once workloads have been changed to new numba-dpex API
+      # - name: Run rodinia benchmarks
+      #   run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench --precision=${{matrix.precision}} || exit 1
 
       - name: Generate report
         run: dpbench -i ${{env.WORKLOADS}} report || exit 1
diff --git a/dpbench/benchmarks/default/black_scholes/black_scholes_numba_dpex_k.py b/dpbench/benchmarks/default/black_scholes/black_scholes_numba_dpex_k.py
@@ -4,7 +4,7 @@
 
 from math import erf, exp, log, sqrt
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import kernel_api as kapi
 
 

diff --git a/dpbench/benchmarks/default/dbscan/dbscan_numba_dpex_k.py b/dpbench/benchmarks/default/dbscan/dbscan_numba_dpex_k.py
@@ -4,7 +4,7 @@
 
 import dpnp as np
 import numba as nb
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 import numpy
 from numba_dpex import kernel_api as kapi
 

diff --git a/dpbench/benchmarks/default/gpairs/gpairs_numba_dpex_k.py b/dpbench/benchmarks/default/gpairs/gpairs_numba_dpex_k.py
@@ -3,13 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpexexp
 from numba_dpex import kernel_api as kapi
 
 # This implementation is numba dpex kernel version with atomics.
 
 
-@dpexexp.kernel
+@dpex.kernel
 def count_weighted_pairs_3d_intel_no_slm_ker(
     nd_item: kapi.NdItem,
     n,
@@ -151,7 +150,7 @@ def gpairs(
         ceiling_quotient(nbins, private_hist_size) * private_hist_size
     )
 
-    dpexexp.call_kernel(
+    dpex.call_kernel(
         count_weighted_pairs_3d_intel_no_slm_ker,
         kapi.NdRange(dpex.Range(*gwsRange), dpex.Range(*lwsRange)),
         nopt,

diff --git a/dpbench/benchmarks/default/kmeans/kmeans_initialize.py b/dpbench/benchmarks/default/kmeans/kmeans_initialize.py
@@ -17,7 +17,7 @@ def initialize(npoints, niters, seed, ndims, ncentroids, types_dict):
     arrayP = default_rng.uniform(XL, XH, (npoints, ndims)).astype(f_dtype)
     arrayPclusters = np.ones(npoints, dtype=i_dtype)
     arrayC = np.empty((ncentroids, ndims), dtype=f_dtype)
-    arrayCnumpoint = np.ones(ncentroids, dtype=i_dtype)
+    arrayCnumpoint = np.ones(ncentroids, dtype=np.int64)
 
     arrayC[:] = arrayP[:ncentroids]
 

diff --git a/dpbench/benchmarks/default/kmeans/kmeans_numba_dpex_k.py b/dpbench/benchmarks/default/kmeans/kmeans_numba_dpex_k.py
@@ -6,7 +6,6 @@
 from math import sqrt
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpexexp
 from dpctl import tensor as dpt
 from numba_dpex import kernel_api as kapi
 
@@ -23,9 +22,7 @@ def Align(value, base):
 def getGroupByCluster(  # noqa: C901
     dims, num_centroids, dtyp, WorkPI, local_size_
 ):
-    local_copies = min(4, max(1, DivUp(local_size_, num_centroids)))
-
-    @dpexexp.kernel
+    @dpex.kernel
     def groupByCluster(
         nd_item: kapi.NdItem,
         arrayP,
@@ -34,15 +31,12 @@ def groupByCluster(
         NewCentroids,
         NewCount,
         last,
+        local_copies,
+        localCentroids,
+        localNewCentroids,
+        localNewCount,
     ):
         numpoints = arrayP.shape[0]
-        localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
-        localNewCentroids = dpex.local.array(
-            (local_copies, dims, num_centroids), dtype=dtyp
-        )
-        localNewCount = dpex.local.array(
-            (local_copies, num_centroids), dtype=dpt.int32
-        )
 
         grid = nd_item.get_group().get_group_id(0)
         lid = nd_item.get_local_id(0)
@@ -121,20 +115,19 @@ def groupByCluster(
 
 @lru_cache(maxsize=1)
 def getUpdateCentroids(dims, num_centroids, dtyp, local_size_):
-    @dpexexp.kernel
+    @dpex.kernel
     def updateCentroids(
         nd_item: kapi.NdItem,
         diff,
         arrayC,
         arrayCnumpoint,
         NewCentroids,
         NewCount,
+        local_distance,
     ):
         lid = nd_item.get_local_id(0)
         local_size = nd_item.get_local_range(0)
 
-        local_distance = dpex.local.array(local_size_, dtype=dtyp)
-
         max_distance = dtyp.type(0)
         for c in range(lid, num_centroids, local_size):
             numpoints = NewCount[c]
@@ -168,10 +161,11 @@ def updateCentroids(
 
 @lru_cache(maxsize=1)
 def getUpdateLabels(dims, num_centroids, dtyp, WorkPI):
-    @dpexexp.kernel
-    def updateLabels(nd_item: kapi.NdItem, arrayP, arrayPcluster, arrayC):
+    @dpex.kernel
+    def updateLabels(
+        nd_item: kapi.NdItem, arrayP, arrayPcluster, arrayC, localCentroids
+    ):
         numpoints = arrayP.shape[0]
-        localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
 
         grid = nd_item.get_group().get_group_id(0)
         lid = nd_item.get_local_id(0)
@@ -245,16 +239,31 @@ def kmeans_kernel(
     for i in range(niters):
         last = i == (niters - 1)
         if diff_host < tolerance:
-            dpexexp.call_kernel(
+            localCentroids = kapi.LocalAccessor(
+                (dims, num_centroids), dtype=arrayP.dtype
+            )
+
+            dpex.call_kernel(
                 updateLabels,
                 kapi.NdRange((global_size,), (local_size,)),
                 arrayP,
                 arrayPcluster,
                 arrayC,
+                localCentroids,
             )
             break
 
-        dpexexp.call_kernel(
+        local_copies = min(4, max(1, DivUp(local_size, num_centroids)))
+        localCentroids = kapi.LocalAccessor(
+            (dims, num_centroids), dtype=arrayP.dtype
+        )
+        localNewCentroids = kapi.LocalAccessor(
+            (local_copies, dims, num_centroids), dtype=arrayP.dtype
+        )
+        localNewCount = kapi.LocalAccessor(
+            (local_copies, num_centroids), dtype=dpt.int64
+        )
+        dpex.call_kernel(
             groupByCluster,
             kapi.NdRange((global_size,), (local_size,)),
             arrayP,
@@ -263,17 +272,23 @@ def kmeans_kernel(
             NewCentroids,
             NewCount,
             last,
+            local_copies,
+            localCentroids,
+            localNewCentroids,
+            localNewCount,
         )
 
+        local_distance = kapi.LocalAccessor(local_size, dtype=arrayP.dtype)
         update_centroid_size = min(num_centroids, local_size)
-        dpexexp.call_kernel(
+        dpex.call_kernel(
             updateCentroids,
             kapi.NdRange((update_centroid_size,), (update_centroid_size,)),
             diff,
             arrayC,
             arrayCnumpoint,
             NewCentroids,
             NewCount,
+            local_distance,
         )
         diff_host = dpt.asnumpy(diff)[0]
 

diff --git a/dpbench/benchmarks/default/kmeans/kmeans_numba_dpex_p.py b/dpbench/benchmarks/default/kmeans/kmeans_numba_dpex_p.py
@@ -5,6 +5,7 @@
 import dpnp as np
 import numba as nb
 import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
 
 
 # determine the euclidean distance from the cluster center to each point
@@ -40,12 +41,17 @@ def calCentroidsSum(
 
 
 @dpex.kernel
-def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
-    i = dpex.get_global_id(0)
+def calCentroidsSum2(
+    item: kapi.Item, arrayP, arrayPcluster, arrayCsum, arrayCnumpoint
+):
+    i = item.get_id(0)
     ci = arrayPcluster[i]
-    dpex.atomic.add(arrayCsum, (ci, 0), arrayP[i, 0])
-    dpex.atomic.add(arrayCsum, (ci, 1), arrayP[i, 1])
-    dpex.atomic.add(arrayCnumpoint, ci, 1)
+    arrayCsum_aref = kapi.AtomicRef(arrayCsum, index=(ci, 0))
+    arrayCsum_aref.fetch_add(arrayP[i, 0])
+    arrayCsum_aref = kapi.AtomicRef(arrayCsum, index=(ci, 1))
+    arrayCsum_aref.fetch_add(arrayP[i, 1])
+    arrayCnumpoint_aref = kapi.AtomicRef(arrayCnumpoint, index=ci)
+    arrayCnumpoint_aref.fetch_add(1)
 
 
 # update the centriods array after computation
@@ -86,8 +92,13 @@ def kmeans_numba(arrayP, arrayPcluster, arrayC, arrayCnumpoint, niters):
             num_centroids,
         )
 
-        calCentroidsSum2[dpex.Range(num_points)](
-            arrayP, arrayPcluster, arrayCsum, arrayCnumpoint
+        dpex.call_kernel(
+            calCentroidsSum2,
+            kapi.Range(num_points),
+            arrayP,
+            arrayPcluster,
+            arrayCsum,
+            arrayCnumpoint,
         )
 
         # TODO: get rid of it once prange supports dtype

diff --git a/dpbench/benchmarks/default/kmeans/kmeans_sycl_native_ext/kmeans_sycl/_kmeans_sycl.cpp b/dpbench/benchmarks/default/kmeans/kmeans_sycl_native_ext/kmeans_sycl/_kmeans_sycl.cpp
@@ -37,10 +37,6 @@ void kmeans_sync(dpctl::tensor::usm_ndarray arrayP,
         throw std::runtime_error("All arrays must have the same precision");
     }
 
-    if (arrayPclusters.get_typenum() != arrayCnumpoint.get_typenum()) {
-        throw std::runtime_error("All arrays must have the same precision");
-    }
-
     auto npoints = arrayP.get_shape(0);
     auto ncentroids = arrayC.get_shape(0);
     auto ndims = arrayC.get_shape(1);

diff --git a/dpbench/benchmarks/default/knn/knn_numba_dpex_k.py b/dpbench/benchmarks/default/knn/knn_numba_dpex_k.py
@@ -5,12 +5,11 @@
 from math import sqrt
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpexexp
 import numpy as np
 from numba_dpex import kernel_api as kapi
 
 
-@dpexexp.kernel
+@dpex.kernel
 def _knn_kernel(  # noqa: C901: TODO: can we simplify logic?
     item: kapi.Item,
     train,
@@ -109,7 +108,7 @@ def knn(
     votes_to_classes,
     data_dim,
 ):
-    dpexexp.call_kernel(
+    dpex.call_kernel(
         _knn_kernel,
         kapi.Range(test_size),
         x_train,

diff --git a/dpbench/benchmarks/default/l2_norm/l2_norm_numba_dpex_k.py b/dpbench/benchmarks/default/l2_norm/l2_norm_numba_dpex_k.py
@@ -4,7 +4,7 @@
 
 import math
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import kernel_api as kapi
 
 

diff --git a/dpbench/benchmarks/default/pairwise_distance/pairwise_distance_numba_dpex_k.py b/dpbench/benchmarks/default/pairwise_distance/pairwise_distance_numba_dpex_k.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpnp as np
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import kernel_api as kapi
 
 

diff --git a/dpbench/benchmarks/default/rambo/rambo_numba_dpex_k.py b/dpbench/benchmarks/default/rambo/rambo_numba_dpex_k.py
@@ -4,7 +4,7 @@
 
 from math import cos, log, pi, sin, sqrt
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import kernel_api as kapi
 
 

diff --git a/dpbench/configs/bench_info/kmeans.toml b/dpbench/configs/bench_info/kmeans.toml
@@ -30,7 +30,7 @@ output_args = [
 ]
 # TODO: remove once fixed. Fails randomly
 # remove numba_dpex_k once atomics on SLM is implemented
-expected_failure_implementations = ["numba_mlir_k", "numba_dpex_k"]
+expected_failure_implementations = ["numba_mlir_k", "numba_dpex_k", "sycl"]
 
 [benchmark.parameters.S]
 npoints = 4096

diff --git a/dpbench/configs/bench_info/knn.toml b/dpbench/configs/bench_info/knn.toml
@@ -33,8 +33,7 @@ output_args = [
     "predictions",
 ]
 
-# `sycl` fails just on Windows
-expected_failure_implementations = ["numba_dpex_p", "sycl"]
+expected_failure_implementations = ["numba_dpex_p"]
 
 [benchmark.parameters.S]
 test_size = 1024

diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml
@@ -23,7 +23,6 @@ dependencies:
   - dpctl
   - dpnp
   - numba-dpex
-  - numba-mlir
   # TODO: fix issues on conda-forge build
   - intel::dpcpp_linux-64==2024.0.0
   - intel::dpcpp-cpp-rt==2024.0.0

diff --git a/environments/conda-win-sycl.yml b/environments/conda-win-sycl.yml
@@ -23,7 +23,6 @@ dependencies:
   - dpctl
   - dpnp
   - numba-dpex
-  - numba-mlir
   # TODO: switch to conda-forge, but it results in broken OpenCL rt (see below)
   # - conda-forge::dpcpp_win-64
   - intel::dpcpp_win-64==2024.0.0

diff --git a/environments/conda.yml b/environments/conda.yml
@@ -24,4 +24,3 @@ dependencies:
   - dpctl
   - dpnp
   - numba-dpex
-  - numba-mlir
@@ -24,4 +24,3 @@ dependencies:
       - dpctl
       - dpnp
       - numba-dpex
-      - numba-mlir