diff --git a/.github/workflows/nvidia_test.yml b/.github/workflows/nvidia_test.yml
index 88ded86320..118fbb004e 100644
--- a/.github/workflows/nvidia_test.yml
+++ b/.github/workflows/nvidia_test.yml
@@ -63,7 +63,7 @@ jobs:
           cmake -G Ninja \
             -DCUTLASS_ENABLE_SYCL=ON \
             -DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \
-            -DDPCPP_SYCL_ARCH=sm_80
+            -DDPCPP_SYCL_ARCH=sm_90a
           cmake --build .

       - name: Unit test
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 728e5fb2ed..a331f652bd 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -26,6 +26,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+if (SYCL_NVIDIA_TARGET)
+  find_package(CUDAToolkit)
+endif()

 add_custom_target(cutlass_benchmarks)

@@ -56,6 +59,17 @@ function(cutlass_benchmark_add_executable NAME)
     benchmark::benchmark
   )

+  if (SYCL_NVIDIA_TARGET)
+    target_link_libraries(
+      ${NAME}
+      PRIVATE
+      CUDA::cudart
+      CUDA::cuda_driver
+    )
+
+    target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
+  endif()
+
   target_include_directories(
     ${NAME}
     PRIVATE
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
index f14a62b267..5ff7aa13e0 100644
--- a/cmake/FindDPCPP.cmake
+++ b/cmake/FindDPCPP.cmake
@@ -53,9 +53,13 @@ endif()

 if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
   if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
+    list(APPEND DPCPP_FLAGS "-fsycl-cuda-compatibility")
+    # Allow to use grid constant, beneficial even if CUTensorMap is not used.
+    list(APPEND DPCPP_FLAGS "-fno-sycl-decompose-functor")
     list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
     list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
     list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
+    set(CMAKE_CUDA_ARCHITECTURES "${CUTLASS_NVCC_ARCHS}")
   endif()
 endif()

diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
index 1280ad5b40..1e8f4efe47 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
@@ -265,16 +265,12 @@ struct ExampleRunner {
         M * N // batch_stride_D
     );

-#if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::wait_and_throw();
-#else
     cudaError_t result = cudaDeviceSynchronize();
     if (result != cudaSuccess) {
       std::cerr << "Reference kernel failed. Last CUDA error: "
                 << cudaGetErrorString(result) << std::endl;
       return false;
     }
-#endif

     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
diff --git a/examples/35_gemm_softmax/gemm_online_softmax.cpp b/examples/35_gemm_softmax/gemm_online_softmax.cpp
index 11d01f6329..e2f473d089 100644
--- a/examples/35_gemm_softmax/gemm_online_softmax.cpp
+++ b/examples/35_gemm_softmax/gemm_online_softmax.cpp
@@ -247,7 +247,7 @@ struct ExampleRunner {
       float abs_ref = fabs((float)vector_Input_Ref.at(i));
      float relative_diff = abs_ref > abs_tol ?
abs_diff / abs_ref : 0; if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) { - printf("i = %d diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); + printf("i = %ld diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); return false; } diff --git a/examples/35_gemm_softmax/softmax_finalize.hpp b/examples/35_gemm_softmax/softmax_finalize.hpp index ca6e6ac93a..ab6ef57b1b 100644 --- a/examples/35_gemm_softmax/softmax_finalize.hpp +++ b/examples/35_gemm_softmax/softmax_finalize.hpp @@ -90,6 +90,8 @@ class SoftmaxFinalize { }; static constexpr int SharedStorageSize = sizeof(SharedStorage); + static constexpr int MaxThreadsPerBlock = MaxNumThreadsPerBlock; + static constexpr int MinBlocksPerMultiprocessor = 1; // // Params struct diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu index 3a35cd7197..3af4469184 100644 --- a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu +++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu @@ -510,7 +510,7 @@ int main(int argc, char const **args) { // #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) - run(options); + return run(options); #endif return 0; diff --git a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh index b4cafccbb5..070e80f853 100644 --- a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh +++ b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh @@ -37,7 +37,7 @@ namespace example // Naive grid-stride loop implementation of gather template -__global__ void +CUTLASS_GLOBAL void gather_kernel(Element const * __restrict__ input, Element * __restrict__ output, Func func, @@ -76,17 +76,28 @@ gather(Element const * input, cutlass::FastDivmod stride_divmod(stride_upcast); dim3 blocks(hw_info.sm_count, 1, batch_size); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +#else gather_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), func, num_elems_input_upcast, num_elems_output_upcast, stride_divmod); +#endif } // Naive grid-stride loop implementation of scatter template -__global__ void +CUTLASS_GLOBAL void scatter_kernel(Element const * __restrict__ input, Element * __restrict__ output, Func func, @@ -125,12 +136,23 @@ scatter(Element const * input, cutlass::FastDivmod stride_divmod(stride_upcast); dim3 blocks(hw_info.sm_count, 1, batch_size); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +#else scatter_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), func, num_elems_input_upcast, num_elems_output_upcast, stride_divmod); +#endif } } // namespace example diff --git a/examples/53_hopper_gemm_permute/permute_kernel.cuh b/examples/53_hopper_gemm_permute/permute_kernel.cuh index 0cb1aad901..a94774978b 100644 --- a/examples/53_hopper_gemm_permute/permute_kernel.cuh +++ b/examples/53_hopper_gemm_permute/permute_kernel.cuh 
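The gather_kernel/scatter_kernel hunks above and the permute_kernel hunk below all follow the same pattern: keep the CUDA triple-chevron launch, and add a syclcompat::launch path under CUTLASS_ENABLE_SYCL. A minimal sketch of that pattern with a hypothetical axpy_kernel (not code from this patch; it assumes CUTLASS_GLOBAL from cutlass/detail/helper_macros.hpp, the launch<Kernel>(grid, block, args...) entry point used in these hunks, and that CUDA builtin indices are usable in SYCL device code once -fsycl-cuda-compatibility is enabled as in FindDPCPP.cmake above):

#include "cutlass/cutlass.h"   // assumed to bring in CUTLASS_GLOBAL and the SYCL compatibility shims

// Grid-stride kernel, same shape as gather_kernel/scatter_kernel.
template <class Element>
CUTLASS_GLOBAL void axpy_kernel(Element const* x, Element* y, Element a, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    y[i] = a * x[i] + y[i];
  }
}

template <class Element>
void launch_axpy(Element const* x, Element* y, Element a, int n) {
  unsigned grid_x = unsigned((n + 255) / 256);
#if defined(CUTLASS_ENABLE_SYCL)
  // The kernel is a template argument; its arguments are forwarded after the
  // launch dimensions, mirroring the gather/scatter/permute launches here.
  syclcompat::dim3 sycl_grid(grid_x, 1, 1);
  syclcompat::launch<axpy_kernel<Element>>(sycl_grid, 256, x, y, a, n);
#else
  axpy_kernel<Element><<<dim3(grid_x, 1, 1), 256>>>(x, y, a, n);
#endif
}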
@@ -47,7 +47,7 @@ namespace example * For row major, the inputs must be switched accordingly. */ template -__global__ void +CUTLASS_GLOBAL void permute_kernel(Element const* __restrict__ input, Element* __restrict__ output, Permute permute, @@ -82,11 +82,22 @@ void permute(Element const* input, cutlass::FastDivmod stride_divmod(stride); dim3 blocks(hw_info.sm_count, 1, batch_count); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + permute_upcast, + num_elems_upcast, + stride_upcast); + +#else permute_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), permute_upcast, num_elems_upcast, stride_upcast); +#endif } } // namespace example diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 84fc931118..1b8bde4ba5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -30,6 +30,10 @@ set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common) +if (SYCL_NVIDIA_TARGET) + find_package(CUDAToolkit) +endif() + add_custom_target(cutlass_examples) add_custom_target(test_examples) @@ -56,11 +60,22 @@ function(cutlass_example_add_executable NAME) ${NAME} PRIVATE CUTLASS - cutlass_tools_util_includes + cutlass_library_includes $<$:nvidia::cublas> $<$:cuda> ) + if (SYCL_NVIDIA_TARGET) + target_link_libraries( + ${NAME} + PRIVATE + CUDA::cudart + CUDA::cuda_driver + ) + + target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR}) + endif() + target_include_directories( ${NAME} PRIVATE @@ -174,13 +189,33 @@ foreach(EXAMPLE add_subdirectory(${EXAMPLE}) endforeach() else() - foreach(EXAMPLE + set(EXAMPLES 14_ampere_tf32_tensorop_gemm 35_gemm_softmax cute sycl + ) + + if (SYCL_NVIDIA_TARGET) + list(APPEND EXAMPLES + 48_hopper_warp_specialized_gemm + 49_hopper_gemm_with_collective_builder + 50_hopper_gemm_with_epilogue_swizzle + 51_hopper_gett + 52_hopper_gather_scatter_fusion + 53_hopper_gemm_permute + 54_hopper_fp8_warp_specialized_gemm + 55_hopper_mixed_dtype_gemm + 56_hopper_ptr_array_batched_gemm + 57_hopper_grouped_gemm + 61_hopper_gemm_with_topk_and_softmax + 63_hopper_gemm_with_weight_prefetch + # 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling # error: SYCL kernel cannot call a variadic function + 69_hopper_mixed_dtype_grouped_gemm ) + endif() + + foreach(EXAMPLE ${EXAMPLES}) add_subdirectory(${EXAMPLE}) endforeach() endif() - diff --git a/include/cute/arch/util.hpp b/include/cute/arch/util.hpp index b0899f7a83..82d4538dd8 100644 --- a/include/cute/arch/util.hpp +++ b/include/cute/arch/util.hpp @@ -279,7 +279,6 @@ explode(Fn fn, return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...); } -#if defined(CUTLASS_ENABLE_SYCL) template , { return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...); } -#endif + +template +CUTE_HOST_DEVICE constexpr +void +explode_mma(PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence, + PtrE&& e, int_sequence) +{ + return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...); +} // // Utility for exploding tuples into functions diff --git a/include/cute/atom/mma_traits_sm90_gmma.hpp b/include/cute/atom/mma_traits_sm90_gmma.hpp index e688a7e6a8..028d0e79ae 100644 --- a/include/cute/atom/mma_traits_sm90_gmma.hpp +++ b/include/cute/atom/mma_traits_sm90_gmma.hpp @@ -418,7 +418,7 @@ mma_unpack(MMA_Traits const& 
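The explode_mma helpers added to cute/arch/util.hpp above, and now used unconditionally by mma_unpack in the hunks that follow, apply cute's index-sequence "explode" idiom with the MMA operation named as a template parameter instead of being passed in as MMA_Op::fma, so device code never handles a callable argument. A self-contained sketch of the idiom with a made-up FMA_2x1 operation and array sizes:

#include <cstdio>
#include <utility>

struct FMA_2x1 {
  // d[0..1] += a[0..1] * b[0]
  static void fma(float& d0, float& d1, float a0, float a1, float b0) {
    d0 += a0 * b0;
    d1 += a1 * b0;
  }
};

// Each integer pack expands one array into one scalar argument per element,
// exactly like MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...) in cute::detail.
template <class Op, class PtrD, class PtrA, class PtrB, int... Id, int... Ia, int... Ib>
void explode(PtrD&& d, std::integer_sequence<int, Id...>,
             PtrA&& a, std::integer_sequence<int, Ia...>,
             PtrB&& b, std::integer_sequence<int, Ib...>) {
  Op::fma(d[Id]..., a[Ia]..., b[Ib]...);
}

int main() {
  float d[2] = {0.f, 0.f};
  float a[2] = {1.f, 2.f};
  float b[1] = {3.f};
  explode<FMA_2x1>(d, std::make_integer_sequence<int, 2>{},
                   a, std::make_integer_sequence<int, 2>{},
                   b, std::make_integer_sequence<int, 1>{});
  std::printf("%g %g\n", d[0], d[1]);  // prints: 3 6
  return 0;
}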
traits, CUTE_STATIC_ASSERT_V(size(rB) == Int{}); CUTE_STATIC_ASSERT_V(size(rC) == Int{}); - detail::explode(MMA_Op::fma, + detail::explode_mma( rA, make_int_sequence{}, rB, make_int_sequence{}, rC, make_int_sequence{}, diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp index 13ff07c89f..4dc6eb2705 100644 --- a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp +++ b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp @@ -197,7 +197,8 @@ mma_unpack(MMA_Traits const& traits, CUTE_STATIC_ASSERT_V(size(rC) == Int{}); - detail::explode(MMAOp::fma, + + detail::explode_mma( rA, make_int_sequence{}, rB, make_int_sequence{}, rC, make_int_sequence{}, diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index 262f5b8974..ffdf2756c1 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -45,9 +45,9 @@ #include #endif -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) #include -#endif +#endif // defined(__CUDA__) #include "cutlass/cutlass.h" #include "cutlass/platform/platform.h" @@ -103,7 +103,7 @@ struct alignas(2) bfloat16_t { /// Default constructor bfloat16_t() = default; -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) /// Reinterpret cast from CUDA's __nv_bfloat16 type CUTLASS_HOST_DEVICE explicit bfloat16_t(__nv_bfloat16 const & x) { @@ -199,7 +199,7 @@ struct alignas(2) bfloat16_t { return (float(*this) != 0.0f); } -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) /// Bitcasts to CUDA's bf16 type CUTLASS_DEVICE __nv_bfloat16 to_nv_bfloat16() const { diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index ea355c7130..efab864fe6 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -31,7 +31,7 @@ #pragma once -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) #include #include #else diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp index dffd2c7e15..4194c496aa 100644 --- a/include/cutlass/cuda_host_adapter.hpp +++ b/include/cutlass/cuda_host_adapter.hpp @@ -35,7 +35,7 @@ #pragma once -#if !defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA__) #include #endif #include "cutlass/cutlass.h" @@ -85,14 +85,16 @@ namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// -#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL) +#if !defined(__CUDACC_RTC__) #if ((__CUDACC_VER_MAJOR__ >= 12) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) #include #endif // (__CUDACC_VERSION__ >= 11.8) +#if !defined(CUTLASS_ENABLE_SYCL) #include +#endif #define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok diff --git a/include/cutlass/detail/collective/mixed_input_utils.hpp b/include/cutlass/detail/collective/mixed_input_utils.hpp index 7a8ac1c012..791b2899ed 100644 --- a/include/cutlass/detail/collective/mixed_input_utils.hpp +++ b/include/cutlass/detail/collective/mixed_input_utils.hpp @@ -107,7 +107,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t xor_mask = 0x43084308; static constexpr uint32_t lo_mask = 0x000F000F; static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -159,7 +159,7 @@ struct LayoutAwareConvertImpl< static constexpr 
uint32_t or_mask = 0x43004300; static constexpr uint32_t lo_mask = 0x000F000F; static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -230,7 +230,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t lo_bias = 0x64086408; // {1032, 1032} static constexpr uint32_t hi_bias = 0xD480D480; // {-72, -72} static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) { half2& fp16x2_val = reinterpret_cast(r[ii + 0]); fp16x2_val = fp16x2_val - reinterpret_cast(lo_bias); @@ -308,7 +308,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t lo_bias = or_mask; // 0x64006400, {1024, 1024} static constexpr uint32_t hi_bias = 0xD400D400; // {-64, -64} static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) { half2& fp16x2_val = reinterpret_cast(r[ii + 0]); fp16x2_val = fp16x2_val - reinterpret_cast(lo_bias); @@ -412,7 +412,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t and_mask_0 = 0x007F007F; static constexpr uint32_t and_mask_1 = 0x00800080; static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -477,7 +477,7 @@ struct LayoutAwareConvertImpl< : "n"(and_mask), "n"(xor_mask), "n"(immLut)); { static constexpr uint32_t bias = 0x64806480; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - reinterpret_cast(bias); #else diff --git a/include/cutlass/float8.h b/include/cutlass/float8.h index 7be37d3fc8..d445d9f734 100644 --- a/include/cutlass/float8.h +++ b/include/cutlass/float8.h @@ -85,14 +85,15 @@ #include #endif -#if !defined(CUTLASS_ENABLE_SYCL) #ifdef CUDA_FP8_ENABLED #include -#endif -#include -#else +#endif // CUDA_FP8_ENABLED + +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) #include -#endif +#else +#include +#endif // defined(__CUDA__) #include "cutlass/cutlass.h" diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp index 5b22b29296..b498333665 100644 --- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp @@ -363,7 +363,7 @@ struct CollectiveMma< --k_tile_count; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; @@ -382,7 +382,7 @@ struct CollectiveMma< ++k_tile_iter; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp index 23b8a868a9..eb936eefe8 100644 --- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp 
+++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp @@ -289,7 +289,7 @@ struct CollectiveMma< --k_tile_count; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; @@ -308,7 +308,7 @@ struct CollectiveMma< ++k_tile_iter; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index bcfc38e097..be7bbee167 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -490,7 +490,7 @@ struct CollectiveMma< // Copy scale tensors from global memory to shared memory copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); copy_if(scale_copy_b, tBpB_ScaleB, tBgB_ScaleB(_,_,*k_tile_iter), tBsB_ScaleB(_,_,write_stage)); - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + pipeline.template producer_commit(smem_pipe_write); ++k_tile_iter; diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 5588e0ca08..9449b8386f 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -393,7 +393,6 @@ class GemmUniversalAdapter< #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API"); #endif -#if !defined(CUTLASS_ENABLE_SYCL) [[maybe_unused]] constexpr bool is_static_1x1x1 = cute::is_static_v and cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1; @@ -401,6 +400,7 @@ class GemmUniversalAdapter< cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})); +#if !defined(CUTLASS_ENABLE_SYCL) // Dynamic cluster support [[maybe_unused]] dim3 fallback_cluster = dim3{0,0,0}; if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 @@ -519,6 +519,32 @@ class GemmUniversalAdapter< } } +#elif defined(__CUDA__) + using namespace syclcompat::experimental; + auto launch_props = [smem_size, cluster] { + if constexpr (is_static_1x1x1) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + } else { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::cuda::cluster_size( + sycl::range<3>(cluster.z, cluster.y, cluster.x) + ), + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + } + }(); + launch_properties l_props(launch_props); + kernel_properties k_props( + sycl::ext::oneapi::experimental::max_linear_work_group_size + ); + syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, l_props, k_props}; +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("GemmUniversal::run: Launching sm90 kernel with syclcompat"); +#endif + auto event = syclcompat::experimental::launch>(policy, params); + 
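The statement above is the SYCL-on-NVIDIA launch path for the SM90 adapter: launch properties carry the dynamic shared-memory size (and, for non-trivial cluster shapes, the CUDA cluster dimensions), and the kernel goes to syclcompat::experimental::launch as a template argument. A usage sketch of the same shapes for a hypothetical kernel (the property and policy spellings are taken from the hunk above rather than verified against a particular syclcompat release; my_kernel, the 48 KB scratch size, and the 256-thread bound are made up):

#include <sycl/sycl.hpp>
#include <syclcompat.hpp>  // header layout assumed

struct KernelParams { /* ... */ };

// Stand-in for the CUTLASS device kernel entry point.
template <class Params>
void my_kernel(Params) {}

template <class Params>
void launch_with_scratch(Params const& params,
                         syclcompat::dim3 grid, syclcompat::dim3 block) {
  using namespace syclcompat::experimental;

  // Dynamic shared memory is requested via a launch property rather than the
  // third <<<...>>> argument.
  auto props = sycl::ext::oneapi::experimental::properties{
      sycl::ext::oneapi::experimental::work_group_scratch_size(48 * 1024)};

  launch_properties l_props(props);
  kernel_properties k_props(
      sycl::ext::oneapi::experimental::max_linear_work_group_size<256>);

  launch_policy policy{grid, block, l_props, k_props};
  auto event = launch<my_kernel<Params>>(policy, params);
  event.wait();
}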
EventManager::getInstance().addEvent(event); #endif } else { diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index 3ce9e01b90..91cc8d913f 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -38,8 +38,10 @@ #if defined(CUTLASS_ENABLE_SYCL) #include +#if !defined(__CUDA__) #include -#endif +#endif // !defined(__CUDA__) +#endif // defined(CUTLASS_ENABLE_SYCL) //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -322,6 +324,28 @@ T shfl_xor_sync( //////////////////////////////////////////////////////////////////////////////////////////////////// +// Needed for CUTLASS_TRACE_HOST + +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) +namespace syclcompat { +static inline std::ostream& operator<<(std::ostream &os, const dim3& dims) +{ + os << dims.x << ", " << dims.y << ", " << dims.z; + return os; +} +} +#endif + +#if defined(CUTLASS_ENABLE_SYCL) && defined(__CUDA__) +static inline std::ostream& operator<<(std::ostream &os, const dim3 &dims) +{ + os << dims.x << ", " << dims.y << ", " << dims.z; + return os; +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /* * The CUDA API has functions and types in the global namespace. Ideally, we'd generalize them for both, CUDA and SYCL, * but that requires major changes in Cutlass. To avoid that, we redefine them in the Cutlass namespace that is the base @@ -329,7 +353,7 @@ T shfl_xor_sync( * with CUDA definitions. When using CUDA, only the global definitions are available. This way we don't have to modify * the codebase, and we can rely on the compiler to select the right definition in both cases. */ -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) namespace cutlass { @@ -449,6 +473,6 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( namespace cute { using dim3 = syclcompat::dim3; } -#endif +#endif // defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h index 4a37f6bcb6..645bc16258 100644 --- a/include/cutlass/kernel_hardware_info.h +++ b/include/cutlass/kernel_hardware_info.h @@ -31,9 +31,11 @@ #pragma once #include "cutlass/device_kernel.h" -#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL) +#if !defined(__CUDACC_RTC__) +#if !defined(CUTLASS_ENABLE_SYCL) #include "cuda_runtime.h" #include "cutlass/cluster_launch.hpp" +#endif #include "cutlass/trace.h" #endif #include @@ -58,13 +60,13 @@ struct KernelHardwareInfo { // Methods // -#if defined (CUTLASS_ENABLE_SYCL) +#if defined (CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) static inline int query_device_multiprocessor_count(int device_id = 0) { auto queue = syclcompat::get_default_queue(); auto dev = queue.get_device(); int multiprocessor_count = 1; - //TODO (Codeplay): Replace with device.get_info() once available + //TODO (Codeplay): Replace with device.get_info() once available #if defined __SYCL_CUDA_ARCH__ multiprocessor_count = dev.get_info(); #elif defined SYCL_INTEL_TARGET diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index 3aeea4c375..c28eb19d17 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -5347,7 +5347,7 @@ struct NumericArrayConverter { CUTLASS_PRAGMA_UNROLL for 
(int ii = 0; ii < PackedResultType::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) t[ii] = syclcompat::dp4a(x, mask[ii], 0); #else t[ii] = __dp4a(x, mask[ii], 0); @@ -5563,7 +5563,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale_rep), @@ -5706,7 +5706,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale_rep), @@ -5860,7 +5860,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(hfma_scale, fp16x2_val, hfma_bias); #else @@ -5991,7 +5991,7 @@ struct NumericArrayConverter { static constexpr uint32_t hfma_scale = 0x2C003C00; // {1 / 16, 1} { - #if defined(CUTLASS_ENABLE_SYCL) + #if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale), reinterpret_cast(hfma_bias)); #else @@ -6111,7 +6111,7 @@ struct NumericArrayConverter { const half2& bias = reinterpret_cast(bias_rep); CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - bias; #else @@ -6195,7 +6195,7 @@ struct NumericArrayConverter { const half2& bias = reinterpret_cast(bias_rep); CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - bias; #else diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index a58893196f..143296065f 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -430,11 +430,18 @@ class PipelineTmaAsync { producer_commit(state.index(), bytes); } + template + CUTLASS_DEVICE + void producer_commit(PipelineState state) { + ArriveOp(producer_get_barrier(state.index())); + } +#if !defined(CUTLASS_ENABLE_SYCL) template CUTLASS_DEVICE void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) { cute::forward(user_defined_arrive_op)(producer_get_barrier(state.index()));; } +#endif // Prevents early exit of producer blocks in Cluster. // This should be called once before kernel exits. 
@@ -1074,12 +1081,20 @@ class PipelineAsync { producer_commit(state.index()); } + template + CUTLASS_DEVICE + void producer_commit(PipelineState state) { + ArriveOp(producer_get_barrier(state.index())); + producer_commit(state); + } +#if !defined(CUTLASS_ENABLE_SYCL) template CUTLASS_DEVICE void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) { cute::forward(user_defined_arrive_op)(producer_get_barrier(state.index())); producer_commit(state); } +#endif // Prevents early exit of producer blocks in Cluster. // This should be called once before kernel exits. diff --git a/include/cutlass/sycl_vector_types.h b/include/cutlass/sycl_vector_types.h index fe5f931201..30a44eaf54 100644 --- a/include/cutlass/sycl_vector_types.h +++ b/include/cutlass/sycl_vector_types.h @@ -30,6 +30,8 @@ **************************************************************************************************/ #pragma once +#if !defined(__CUDA__) + #include "cutlass/detail/helper_macros.hpp" // Add these definitions in the cutlass namespace, so they do not clash with the ones in cuda @@ -103,3 +105,4 @@ int4 make_int4(int x, int y, int z, int w) { return int4 {x,y,z,w}; } } +#endif // !defined(__CUDA__) diff --git a/test/unit/cute/cooperative_gemm_common.hpp b/test/unit/cute/cooperative_gemm_common.hpp index 3fd61d93f9..e154d61a2b 100644 --- a/test/unit/cute/cooperative_gemm_common.hpp +++ b/test/unit/cute/cooperative_gemm_common.hpp @@ -226,7 +226,7 @@ cooperative_gemm_kernel(GMemALayout gmem_a_layout, constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8; - auto smem_buf = reinterpret_cast((char*)base_smem); + auto smem_buf = reinterpret_cast((char*)base_smem); auto* smem_ptr = reinterpret_cast(smem_buf); auto* smem_ptr_a = smem_ptr; @@ -301,7 +301,7 @@ cooperative_gemm_kernel_rmem_c(GMemALayout gmem_a_layout, constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8; - auto smem_buf = reinterpret_cast((char*)base_smem); + auto smem_buf = reinterpret_cast((char*)base_smem); auto* smem_ptr = reinterpret_cast(smem_buf); auto* smem_ptr_a = smem_ptr; auto* smem_ptr_b = smem_ptr_a + round_up((sizeof(TA) * cosize(smem_a_layout)), copy_max_vec_bytes); diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index ca9d220fc9..2d2c7e374a 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -193,6 +193,10 @@ void insert_to_device(T* device_begin, InputIterator begin, InputIterator end) { template class DeviceAllocation { public: +#if defined(CUTLASS_ENABLE_SYCL) + // keep a reference to the device so it's not destructed before the destructor of this class (siof) + sycl::device dev = syclcompat::get_default_queue().get_device(); +#endif /// Delete functor for CUDA device memory struct deleter { diff --git a/tools/util/include/cutlass/util/mixed_dtype_utils.hpp b/tools/util/include/cutlass/util/mixed_dtype_utils.hpp index 0c7a93a39f..c499173e9a 100644 --- a/tools/util/include/cutlass/util/mixed_dtype_utils.hpp +++ b/tools/util/include/cutlass/util/mixed_dtype_utils.hpp @@ -63,7 +63,7 @@ template < class ElementZero, class ScaleBroadCastLayout, class ThrLayout> -__global__ void dequantize_kernel(DequantizedElement* dq_buffer, +CUTLASS_GLOBAL void dequantize_kernel(DequantizedElement* dq_buffer, QuantizedElement const* q_buffer, OperandLayout const operand_layout, ElementScale const* scale_buffer, @@ -178,7 +178,14 @@ static void dequantize(DequantizedElement* dq_buffer, const 
auto blocks_y = batches; dim3 blocks(blocks_x, blocks_y, 1); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid{blocks.x, blocks.y, blocks.z}; + syclcompat::launch>(sycl_grid, tpb, + dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout); + +#else dequantize_kernel<<>>(dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout); +#endif CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -394,7 +401,7 @@ constexpr auto compute_memory_reordering_atom(AtomLayout atom_layout = {}, ValLa } template -__global__ void reorder_tensor_kernel( +CUTLASS_GLOBAL void reorder_tensor_kernel( cute::Tensor S, cute::Tensor D, TiledCopy tiled_copy) @@ -445,7 +452,18 @@ void reorder_tensor( auto tiled_D = group_modes<3,rank_v>(tiled_divide(D, TileShape{})); dim3 blocks{unsigned(size<1>(tiled_D)), 1u, unsigned(size<3>(tiled_D))}; +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid{blocks.x, blocks.y, blocks.z}; + syclcompat::launch>(sycl_grid, NumThreads, + S, D, tiled_copy); +#else reorder_tensor_kernel<<>>(S, D, tiled_copy); +#endif CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h index 7d575d522c..4040a493bc 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/gemm.h @@ -99,6 +99,20 @@ void compute_gemm( ); // Launch a GEMM kernel +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::launch< + kernel::Gemm< + TensorRef, + TensorRef, + TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + >>(syclcompat::dim3(grid.x, grid.y, grid.z), + syclcompat::dim3(block.x, block.y, block.z), +#else kernel::Gemm< TensorRef, TensorRef, @@ -109,6 +123,7 @@ void compute_gemm( InnerProductOp, ConvertOp ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, @@ -333,6 +348,20 @@ void BatchedGemm( batch_count ); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::launch< + kernel::BatchedGemm< + TensorRefCollectionA, + TensorRefCollectionB, + TensorRefCollectionC, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + >>(syclcompat::dim3(grid.x, grid.y, grid.z), + syclcompat::dim3(block.x, block.y, block.z), +#else // Launch a GEMM kernel kernel::BatchedGemm< TensorRefCollectionA, @@ -344,6 +373,7 @@ void BatchedGemm( InnerProductOp, ConvertOp ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index 4e45988a8f..eb3ac61dd6 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -233,10 +233,6 @@ void GemmComplex( int const kMblock = 4; int const kNblock = 4; -#if defined (CUTLASS_ENABLE_SYCL) -using syclcompat::dim3; -#endif - dim3 block(16, 8); dim3 grid( (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), @@ -246,38 +242,23 @@ using syclcompat::dim3; if (grid.y <= std::numeric_limits::max()) { #if defined(CUTLASS_ENABLE_SYCL) - + syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); + syclcompat::dim3 sycl_block(block.x, block.y, block.z); syclcompat::launch>(grid, block, - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - 
batch_stride_B, - batch_stride_C, - batch_stride_D - ); + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kMblock, + kNblock + >>(sycl_grid, sycl_block, #else kernel::GemmComplex< ElementA, @@ -294,6 +275,7 @@ using syclcompat::dim3; kMblock, kNblock ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, @@ -310,7 +292,6 @@ using syclcompat::dim3; batch_stride_C, batch_stride_D ); -#endif } else { // Using bigger thread tile size int const kBigMblock = 4; @@ -324,37 +305,23 @@ using syclcompat::dim3; ); #if defined (CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_Biggrid(Biggrid.x, Biggrid.y, Biggrid.z); + syclcompat::dim3 sycl_Bigblock(Bigblock.x, Bigblock.y, Bigblock.z); syclcompat::launch>(Biggrid, Bigblock, - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - batch_stride_B, - batch_stride_C, - batch_stride_D - ); + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kBigMblock, + kBigNblock + >>(sycl_Biggrid, sycl_Bigblock, #else kernel::GemmComplex< ElementA, @@ -371,6 +338,7 @@ using syclcompat::dim3; kBigMblock, kBigNblock ><<< Biggrid, Bigblock >>>( +#endif problem_size, alpha, tensor_a, @@ -387,7 +355,6 @@ using syclcompat::dim3; batch_stride_C, batch_stride_D ); -#endif } } diff --git a/tools/util/include/cutlass/util/reference/device/gett.hpp b/tools/util/include/cutlass/util/reference/device/gett.hpp index 497a257d17..0f39f2084c 100644 --- a/tools/util/include/cutlass/util/reference/device/gett.hpp +++ b/tools/util/include/cutlass/util/reference/device/gett.hpp @@ -44,7 +44,7 @@ template < class DTensor, class ElementAccumulator, class ElementEpilogue> -__global__ static +CUTLASS_GLOBAL void gett_kernel( DTensor D, @@ -140,7 +140,19 @@ gett( dim3 dimBlock(256); dim3 dimGrid(240); +#if defined(CUTLASS_ENABLE_SYCL) + const syclcompat::dim3 sycl_grid(dimGrid.x, dimGrid.y, dimGrid.z); + const syclcompat::dim3 sycl_block(dimBlock.x, dimBlock.y, dimBlock.z); + syclcompat::launch>(sycl_grid, sycl_block, D, A, B, C, alpha, beta, ElementAccumulator(0)); +#else gett_kernel<<< dimGrid, dimBlock, 0, stream >>>(D, A, B, C, alpha, beta, ElementAccumulator(0)); +#endif } } // namespace cutlass::reference::device diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 6e131126a3..319f6e05b0 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -59,7 +59,7 @@ template < typename InnerProductOp, typename ConvertOp > -__global__ void Gemm( +CUTLASS_GLOBAL void Gemm( gemm::GemmCoord problem_size, ScalarType alpha, TensorRefA tensor_a, @@ -110,7 +110,7 @@ template < typename InnerProductOp, typename ConvertOp > -__global__ void BatchedGemm( +CUTLASS_GLOBAL void BatchedGemm( gemm::GemmCoord problem_size, ScalarType alpha, TensorRefCollectionA tensor_collection_a, diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 3de89c0648..582beb65f4 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -33,6 +33,7 @@ #include #include 
"cutlass/cutlass.h" #include "cutlass/util/reference/device/kernel/tensor_foreach.h" +#include "cutlass/kernel_hardware_info.h" namespace cutlass { namespace reference { @@ -74,8 +75,8 @@ struct TensorForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, size, params); #else dim3 grid(grid_size, 1, 1); @@ -102,8 +103,8 @@ struct TensorDiagonalForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3((end - start + block_size - 1) / block_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid((end - start + block_size - 1) / block_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, size, params, start, end); #else dim3 block(block_size, 1, 1); @@ -151,8 +152,8 @@ struct BlockForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, ptr, capacity, params); #else dim3 grid(grid_size, 1, 1);