diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 7342d0b8..ca6fd778 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -156,10 +156,10 @@ jobs:
 
       # Needed to be able to run conda index
       - name: Install conda-build
-        run: mamba install conda-build
+        run: mamba install conda-build conda-index
 
       - name: Create conda channel
-        run: conda index ${{ env.CHANNEL_PATH }}
+        run: python -m conda_index ${{ env.CHANNEL_PATH }}
 
       - name: Test conda channel
         run: |
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
index 927d71cb..8cf4650e 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
+++ b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 from scipy.special import erf
 
 
@@ -14,12 +14,12 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     S = strike
     T = t
 
-    a = np.log(P / S)
+    a = cp.log(P / S)
     b = T * mr
 
     z = T * sig_sig_two
     c = 0.25 * z
-    y = np.true_divide(1.0, np.sqrt(z))
+    y = cp.true_divide(1.0, cp.sqrt(z))
 
     w1 = (a - b + c) * y
     w2 = (a - b - c) * y
@@ -27,7 +27,9 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     d1 = 0.5 + 0.5 * erf(w1)
     d2 = 0.5 + 0.5 * erf(w2)
 
-    Se = np.exp(b) * S
+    Se = cp.exp(b) * S
 
     call[:] = P * d1 - Se * d2
     put[:] = call - P + Se
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py
index 502b6a2f..dc4ab77c 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py
+++ b/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py
@@ -4,16 +4,19 @@
 
 from math import erf, exp, log, sqrt
 
-import numba_dpex as dpex
+import numba_dpex.experimental as dpex
+from numba_dpex import kernel_api as kapi
 
 
 @dpex.kernel
-def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put):
+def _black_scholes_kernel(
+    item: kapi.Item, nopt, price, strike, t, rate, volatility, call, put
+):
     dtype = price.dtype
     mr = -rate
     sig_sig_two = volatility * volatility * dtype.type(2)
 
-    i = dpex.get_global_id(0)
+    i = item.get_id(0)
 
     P = price[i]
     S = strike[i]
@@ -40,6 +43,15 @@ def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put):
 
 
 def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
-    _black_scholes_kernel[dpex.Range(nopt)](
-        nopt, price, strike, t, rate, volatility, call, put
+    dpex.call_kernel(
+        _black_scholes_kernel,
+        kapi.Range(nopt),
+        nopt,
+        price,
+        strike,
+        t,
+        rate,
+        volatility,
+        call,
+        put,
     )
diff --git a/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py b/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py
index 66e7396c..730a94f2 100644
--- a/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py
+++ b/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py
@@ -4,8 +4,9 @@
 
 import dpnp as np
 import numba as nb
-import numba_dpex as dpex
+import numba_dpex.experimental as dpex
 import numpy
+from numba_dpex import kernel_api as kapi
 
 NOISE = -1
 UNDEFINED = -2
@@ -50,8 +51,10 @@ def _queue_empty(head, tail):
 
 
 @dpex.kernel
-def get_neighborhood(n, dim, data, eps, ind_lst, sz_lst, block_size, nblocks):
-    i = dpex.get_global_id(0)
+def get_neighborhood(
+    item: kapi.Item, n, dim, data, eps, ind_lst, sz_lst, block_size, nblocks
+):
+    i = item.get_id(0)
 
     start = i * block_size
     stop = n if i + 1 == nblocks else start + block_size
@@ -130,7 +133,9 @@ def dbscan(n_samples, n_features, data, eps, min_pts):
     )
     sizes = np.zeros_like(data, shape=n_samples, dtype=np.int64)
 
-    get_neighborhood[dpex.Range(n_samples)](
+    dpex.call_kernel(
+        get_neighborhood,
+        kapi.Range(n_samples),
         n_samples,
         n_features,
         data,
diff --git a/dpbench/benchmarks/gpairs/gpairs_cupy.py b/dpbench/benchmarks/gpairs/gpairs_cupy.py
index 8e26f2cb..f0a58074 100644
--- a/dpbench/benchmarks/gpairs/gpairs_cupy.py
+++ b/dpbench/benchmarks/gpairs/gpairs_cupy.py
@@ -2,19 +2,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins):
     dm = (
-        np.square(x2 - x1[:, None])
-        + np.square(y2 - y1[:, None])
-        + np.square(z2 - z1[:, None])
+        cp.square(x2 - x1[:, None])
+        + cp.square(y2 - y1[:, None])
+        + cp.square(z2 - z1[:, None])
     )
-    return np.array(
-        [np.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
+    ret_arr = cp.array(
+        [cp.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
     )
 
+    cp.cuda.stream.get_current_stream().synchronize()
+
+    return ret_arr
+
 
 def gpairs(nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results):
     results[:] = _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins)
diff --git a/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py b/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py
index 9635d224..65f088fa 100644
--- a/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py
+++ b/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py
@@ -3,12 +3,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import numba_dpex as dpex
+import numba_dpex.experimental as dpexexp
+from numba_dpex import kernel_api as kapi
 
 # This implementation is numba dpex kernel version with atomics.
 
 
-@dpex.kernel
+@dpexexp.kernel
 def count_weighted_pairs_3d_intel_no_slm_ker(
+    nd_item: kapi.NdItem,
     n,
     nbins,
     slm_hist_size,
@@ -25,14 +28,14 @@ def count_weighted_pairs_3d_intel_no_slm_ker(
     result,
 ):
     dtype = x0.dtype
-    lid0 = dpex.get_local_id(0)
-    gr0 = dpex.get_group_id(0)
+    lid0 = nd_item.get_local_id(0)
+    gr0 = nd_item.get_group().get_group_id(0)
 
-    lid1 = dpex.get_local_id(1)
-    gr1 = dpex.get_group_id(1)
+    lid1 = nd_item.get_local_id(1)
+    gr1 = nd_item.get_group().get_group_id(1)
 
-    lws0 = dpex.get_local_size(0)
-    lws1 = dpex.get_local_size(1)
+    lws0 = nd_item.get_local_range(0)
+    lws1 = nd_item.get_local_range(1)
 
     n_wi = 20
 
@@ -107,7 +110,8 @@ def count_weighted_pairs_3d_intel_no_slm_ker(
 
         pk = k
         for p in range(private_hist_size):
-            dpex.atomic.add(result, pk, private_hist[p])
+            result_aref = kapi.AtomicRef(result, index=pk)
+            result_aref.fetch_add(private_hist[p])
             pk += 1
 
 
@@ -147,7 +151,9 @@ def gpairs(
         ceiling_quotient(nbins, private_hist_size) * private_hist_size
     )
 
-    count_weighted_pairs_3d_intel_no_slm_ker[dpex.NdRange(gwsRange, lwsRange)](
+    dpexexp.call_kernel(
+        count_weighted_pairs_3d_intel_no_slm_ker,
+        kapi.NdRange(kapi.Range(*gwsRange), kapi.Range(*lwsRange)),
         nopt,
         nbins,
         slm_hist_size,
diff --git a/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py b/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py
index b2363256..a66193e8 100644
--- a/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py
+++ b/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py
@@ -6,9 +6,9 @@
 from math import sqrt
 
 import numba_dpex as dpex
-import numpy
+import numba_dpex.experimental as dpexexp
 from dpctl import tensor as dpt
-from numba_dpex import NdRange
+from numba_dpex import kernel_api as kapi
 
 
 def DivUp(numerator, denominator):
@@ -25,9 +25,15 @@ def getGroupByCluster(  # noqa: C901
 ):
     local_copies = min(4, max(1, DivUp(local_size_, num_centroids)))
 
-    @dpex.kernel
+    @dpexexp.kernel
     def groupByCluster(
-        arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last
+        nd_item: kapi.NdItem,
+        arrayP,
+        arrayPcluster,
+        arrayC,
+        NewCentroids,
+        NewCount,
+        last,
     ):
         numpoints = arrayP.shape[0]
         localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
@@ -38,9 +44,9 @@ def groupByCluster(
             (local_copies, num_centroids), dtype=dpt.int32
         )
 
-        grid = dpex.get_group_id(0)
-        lid = dpex.get_local_id(0)
-        local_size = dpex.get_local_size(0)
+        grid = nd_item.get_group().get_group_id(0)
+        lid = nd_item.get_local_id(0)
+        local_size = nd_item.get_local_range(0)
 
         for i in range(lid, num_centroids * dims, local_size):
             localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims]
@@ -51,7 +57,7 @@ def groupByCluster(
             for lc in range(local_copies):
                 localNewCount[lc, c] = 0
 
-        dpex.barrier(dpex.LOCAL_MEM_FENCE)
+        kapi.group_barrier(nd_item.get_group())
 
         for i in range(WorkPI):
             point_id = grid * WorkPI * local_size + i * local_size + lid
@@ -73,44 +79,59 @@ def groupByCluster(
 
                 lc = lid % local_copies
                 for d in range(dims):
-                    dpex.atomic.add(
-                        localNewCentroids, (lc, d, nearest_centroid), localP[d]
+                    localNewCentroids_aref = kapi.AtomicRef(
+                        localNewCentroids,
+                        index=(lc, d, nearest_centroid),
+                        address_space=kapi.AddressSpace.LOCAL,
                     )
+                    localNewCentroids_aref.fetch_add(localP[d])
 
-                dpex.atomic.add(localNewCount, (lc, nearest_centroid), 1)
+                localNewCount_aref = kapi.AtomicRef(
+                    localNewCount,
+                    index=(lc, nearest_centroid),
+                    address_space=kapi.AddressSpace.LOCAL,
+                )
+                localNewCount_aref.fetch_add(1)
 
                 if last:
                     arrayPcluster[point_id] = nearest_centroid
 
-        dpex.barrier(dpex.LOCAL_MEM_FENCE)
+        kapi.group_barrier(nd_item.get_group())
 
         for i in range(lid, num_centroids * dims, local_size):
             local_centroid_d = dtyp.type(0)
             for lc in range(local_copies):
                 local_centroid_d += localNewCentroids[lc, i % dims, i // dims]
 
-            dpex.atomic.add(
-                NewCentroids,
-                (i // dims, i % dims),
-                local_centroid_d,
+            NewCentroids_aref = kapi.AtomicRef(
+                NewCentroids, index=(i // dims, i % dims)
             )
+            NewCentroids_aref.fetch_add(local_centroid_d)
 
         for c in range(lid, num_centroids, local_size):
             local_centroid_npoints = dpt.int32.type(0)
             for lc in range(local_copies):
                 local_centroid_npoints += localNewCount[lc, c]
 
-            dpex.atomic.add(NewCount, c, local_centroid_npoints)
+            NewCount_aref = kapi.AtomicRef(NewCount, index=c)
+            NewCount_aref.fetch_add(local_centroid_npoints)
 
     return groupByCluster
 
 
 @lru_cache(maxsize=1)
 def getUpdateCentroids(dims, num_centroids, dtyp, local_size_):
-    @dpex.kernel
-    def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount):
-        lid = dpex.get_local_id(0)
-        local_size = dpex.get_local_size(0)
+    @dpexexp.kernel
+    def updateCentroids(
+        nd_item: kapi.NdItem,
+        diff,
+        arrayC,
+        arrayCnumpoint,
+        NewCentroids,
+        NewCount,
+    ):
+        lid = nd_item.get_local_id(0)
+        local_size = nd_item.get_local_range(0)
 
         local_distance = dpex.local.array(local_size_, dtype=dtyp)
 
@@ -134,7 +155,7 @@ def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount):
             max_distance = max(max_distance, distance)
             local_distance[c] = max_distance
 
-        dpex.barrier(dpex.LOCAL_MEM_FENCE)
+        kapi.group_barrier(nd_item.get_group())
 
         if lid == 0:
             for c in range(local_size):
@@ -147,19 +168,19 @@ def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount):
 
 @lru_cache(maxsize=1)
 def getUpdateLabels(dims, num_centroids, dtyp, WorkPI):
-    @dpex.kernel
-    def updateLabels(arrayP, arrayPcluster, arrayC):
+    @dpexexp.kernel
+    def updateLabels(nd_item: kapi.NdItem, arrayP, arrayPcluster, arrayC):
         numpoints = arrayP.shape[0]
         localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
 
-        grid = dpex.get_group_id(0)
-        lid = dpex.get_local_id(0)
-        local_size = dpex.get_local_size(0)
+        grid = nd_item.get_group().get_group_id(0)
+        lid = nd_item.get_local_id(0)
+        local_size = nd_item.get_local_range(0)
 
         for i in range(lid, num_centroids * dims, local_size):
             localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims]
 
-        dpex.barrier(dpex.LOCAL_MEM_FENCE)
+        kapi.group_barrier(nd_item.get_group())
 
         for i in range(WorkPI):
             point_id = grid * WorkPI * local_size + i * local_size + lid
@@ -224,19 +245,36 @@ def kmeans_kernel(
     for i in range(niters):
         last = i == (niters - 1)
         if diff_host < tolerance:
-            updateLabels[NdRange((global_size,), (local_size,))](
-                arrayP, arrayPcluster, arrayC
+            dpexexp.call_kernel(
+                updateLabels,
+                kapi.NdRange((global_size,), (local_size,)),
+                arrayP,
+                arrayPcluster,
+                arrayC,
             )
             break
 
-        groupByCluster[NdRange((global_size,), (local_size,))](
-            arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last
+        dpexexp.call_kernel(
+            groupByCluster,
+            kapi.NdRange((global_size,), (local_size,)),
+            arrayP,
+            arrayPcluster,
+            arrayC,
+            NewCentroids,
+            NewCount,
+            last,
         )
 
         update_centroid_size = min(num_centroids, local_size)
-        updateCentroids[
-            NdRange((update_centroid_size,), (update_centroid_size,))
-        ](diff, arrayC, arrayCnumpoint, NewCentroids, NewCount)
+        dpexexp.call_kernel(
+            updateCentroids,
+            kapi.NdRange((update_centroid_size,), (update_centroid_size,)),
+            diff,
+            arrayC,
+            arrayCnumpoint,
+            NewCentroids,
+            NewCount,
+        )
         diff_host = dpt.asnumpy(diff)[0]
 
 
diff --git a/dpbench/benchmarks/knn/knn_numba_dpex_k.py b/dpbench/benchmarks/knn/knn_numba_dpex_k.py
index f5e3935c..640a94cd 100644
--- a/dpbench/benchmarks/knn/knn_numba_dpex_k.py
+++ b/dpbench/benchmarks/knn/knn_numba_dpex_k.py
@@ -5,11 +5,14 @@
 from math import sqrt
 
 import numba_dpex as dpex
+import numba_dpex.experimental as dpexexp
 import numpy as np
+from numba_dpex import kernel_api as kapi
 
 
-@dpex.kernel
+@dpexexp.kernel
 def _knn_kernel(  # noqa: C901: TODO: can we simplify logic?
+    item: kapi.Item,
     train,
     train_labels,
     test,
@@ -21,7 +24,7 @@ def _knn_kernel(  # noqa: C901: TODO: can we simplify logic?
     data_dim,
 ):
     dtype = train.dtype
-    i = dpex.get_global_id(0)
+    i = item.get_id(0)
     # here k has to be 5 in order to match with numpy
     queue_neighbors = dpex.private.array(shape=(5, 2), dtype=dtype)
 
@@ -106,7 +109,9 @@ def knn(
     votes_to_classes,
     data_dim,
 ):
-    _knn_kernel[dpex.Range(test_size)](
+    dpexexp.call_kernel(
+        _knn_kernel,
+        kapi.Range(test_size),
         x_train,
         y_train,
         x_test,
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
index a0a979c5..95582772 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
+++ b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
@@ -2,10 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def l2_norm(a, d):
-    sq = np.square(a)
+    sq = cp.square(a)
     sum = sq.sum(axis=1)
-    d[:] = np.sqrt(sum)
+    d[:] = cp.sqrt(sum)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py b/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py
index cbc7fe37..eb95183b 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py
+++ b/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py
@@ -4,12 +4,13 @@
 
 import math
 
-import numba_dpex as dpex
+import numba_dpex.experimental as dpex
+from numba_dpex import kernel_api as kapi
 
 
 @dpex.kernel
-def l2_norm_kernel(a, d):
-    i = dpex.get_global_id(0)
+def l2_norm_kernel(item: kapi.Item, a, d):
+    i = item.get_id(0)
     a_rows = a.shape[1]
     d[i] = 0.0
     for k in range(a_rows):
@@ -18,4 +19,4 @@ def l2_norm_kernel(a, d):
 
 
 def l2_norm(a, d):
-    l2_norm_kernel[dpex.Range(a.shape[0])](a, d)
+    dpex.call_kernel(l2_norm_kernel, kapi.Range(a.shape[0]), a, d)
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
index d4fc1110..0c534215 100644
--- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
+++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
@@ -2,15 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pairwise_distance(X1, X2, D):
-    x1 = np.sum(np.square(X1), axis=1)
-    x2 = np.sum(np.square(X2), axis=1)
-    np.dot(X1, X2.T, D)
+    x1 = cp.sum(cp.square(X1), axis=1)
+    x2 = cp.sum(cp.square(X2), axis=1)
+    cp.dot(X1, X2.T, D)
     D *= -2
     x3 = x1.reshape(x1.size, 1)
-    np.add(D, x3, D)
-    np.add(D, x2, D)
-    np.sqrt(D, D)
+    cp.add(D, x3, D)
+    cp.add(D, x2, D)
+    cp.sqrt(D, D)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py
index e0fa9982..c3dea96b 100644
--- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py
+++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py
@@ -3,13 +3,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dpnp as np
-import numba_dpex as dpex
+import numba_dpex.experimental as dpex
+from numba_dpex import kernel_api as kapi
 
 
 @dpex.kernel
-def _pairwise_distance_kernel(X1, X2, D):
-    i = dpex.get_global_id(1)
-    j = dpex.get_global_id(0)
+def _pairwise_distance_kernel(item: kapi.Item, X1, X2, D):
+    i = item.get_id(1)
+    j = item.get_id(0)
 
     X1_cols = X1.shape[1]
 
@@ -21,4 +22,10 @@ def _pairwise_distance_kernel(X1, X2, D):
 
 
 def pairwise_distance(X1, X2, D):
-    _pairwise_distance_kernel[dpex.Range(X2.shape[0], X1.shape[0])](X1, X2, D)
+    dpex.call_kernel(
+        _pairwise_distance_kernel,
+        kapi.Range(X2.shape[0], X1.shape[0]),
+        X1,
+        X2,
+        D,
+    )
diff --git a/dpbench/benchmarks/pca/pca_cupy.py b/dpbench/benchmarks/pca/pca_cupy.py
index 4bfb9631..f439117e 100644
--- a/dpbench/benchmarks/pca/pca_cupy.py
+++ b/dpbench/benchmarks/pca/pca_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pca(data, dims_rescaled_data=2):
@@ -10,13 +10,13 @@ def pca(data, dims_rescaled_data=2):
     data -= data.mean(axis=0)
 
     # calculate the covariance matrix
-    v = np.cov(data, rowvar=False, dtype=data.dtype)
+    v = cp.cov(data, rowvar=False, dtype=data.dtype)
 
     # calculate eigenvectors & eigenvalues of the covariance matrix
-    evalues, evectors = np.linalg.eigh(v)
+    evalues, evectors = cp.linalg.eigh(v)
 
     # sort eigenvalues and eigenvectors in decreasing order
-    idx = np.argsort(evalues)[::-1]
+    idx = cp.argsort(evalues)[::-1]
     evectors = evectors[:, idx]
     evalues = evalues[idx]
 
@@ -25,7 +25,10 @@ def pca(data, dims_rescaled_data=2):
     evectors = evectors[:, :dims_rescaled_data]
 
     # carry out the transformation on the data using eigenvectors
-    tdata = np.dot(evectors.T, data.T).T
+    tdata = cp.dot(evectors.T, data.T).T
+
+    cp.cuda.stream.get_current_stream().synchronize()
 
     # return the transformed data, eigenvalues, and eigenvectors
+
     return tdata, evalues, evectors
diff --git a/dpbench/benchmarks/rambo/rambo_cupy.py b/dpbench/benchmarks/rambo/rambo_cupy.py
index 1ffb1b96..87b7b4b0 100644
--- a/dpbench/benchmarks/rambo/rambo_cupy.py
+++ b/dpbench/benchmarks/rambo/rambo_cupy.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def rambo(nevts, nout, C1, F1, Q1, output):
     C = 2.0 * C1 - 1.0
-    S = np.sqrt(1 - np.square(C))
-    F = 2.0 * np.pi * F1
-    Q = -np.log(Q1)
+    S = cp.sqrt(1 - cp.square(C))
+    F = 2.0 * cp.pi * F1
+    Q = -cp.log(Q1)
 
     output[:, :, 0] = Q
-    output[:, :, 1] = Q * S * np.sin(F)
-    output[:, :, 2] = Q * S * np.cos(F)
+    output[:, :, 1] = Q * S * cp.sin(F)
+    output[:, :, 2] = Q * S * cp.cos(F)
     output[:, :, 3] = Q * C
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py b/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py
index 3937a442..43644d06 100644
--- a/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py
+++ b/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py
@@ -4,13 +4,14 @@
 
 from math import cos, log, pi, sin, sqrt
 
-import numba_dpex as dpex
+import numba_dpex.experimental as dpex
+from numba_dpex import kernel_api as kapi
 
 
 @dpex.kernel
-def _rambo(C1, F1, Q1, nout, output):
+def _rambo(item: kapi.Item, C1, F1, Q1, nout, output):
     dtype = C1.dtype
-    i = dpex.get_global_id(0)
+    i = item.get_id(0)
     for j in range(nout):
         C = dtype.type(2.0) * C1[i, j] - dtype.type(1.0)
         S = sqrt(dtype.type(1) - C * C)
@@ -24,10 +25,4 @@ def _rambo(C1, F1, Q1, nout, output):
 
 
 def rambo(nevts, nout, C1, F1, Q1, output):
-    _rambo[dpex.Range(nevts)](
-        C1,
-        F1,
-        Q1,
-        nout,
-        output,
-    )
+    dpex.call_kernel(_rambo, kapi.Range(nevts), C1, F1, Q1, nout, output)
diff --git a/dpbench/configs/bench_info/kmeans.toml b/dpbench/configs/bench_info/kmeans.toml
index 1fea6d20..e2020b23 100644
--- a/dpbench/configs/bench_info/kmeans.toml
+++ b/dpbench/configs/bench_info/kmeans.toml
@@ -29,7 +29,8 @@ output_args = [
     "arrayCnumpoint",
 ]
 # TODO: remove once fixed. Fails randomly
-expected_failure_implementations = ["numba_mlir_k"]
+# remove numba_dpex_k once atomics on SLM is implemented
+expected_failure_implementations = ["numba_mlir_k", "numba_dpex_k"]
 
 [benchmark.parameters.S]
 npoints = 4096
diff --git a/dpbench/configs/bench_info/pairwise_distance.toml b/dpbench/configs/bench_info/pairwise_distance.toml
index ac133a08..b4eece56 100644
--- a/dpbench/configs/bench_info/pairwise_distance.toml
+++ b/dpbench/configs/bench_info/pairwise_distance.toml
@@ -41,7 +41,7 @@ dims = 3
 seed = 7777777
 
 [benchmark.parameters.L]
-npoints = 65536
+npoints = 44032
 dims = 3
 seed = 7777777
 
diff --git a/dpbench/infrastructure/benchmark_validation.py b/dpbench/infrastructure/benchmark_validation.py
index 65d42a19..8db81345 100644
--- a/dpbench/infrastructure/benchmark_validation.py
+++ b/dpbench/infrastructure/benchmark_validation.py
@@ -91,7 +91,7 @@ def relative_error(
     Returns: relative error.
     """
     ref_norm = np.linalg.norm(ref)
-    if ref_norm:
+    if ref_norm == 0:
         val_norm = np.linalg.norm(val)
         if val_norm == 0:
             return 0.0