Enable SM90 via sycl-cuda-compat #276

Open · wants to merge 9 commits into base: sycl-develop
2 changes: 1 addition & 1 deletion .github/workflows/nvidia_test.yml
@@ -63,7 +63,7 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \
-DDPCPP_SYCL_ARCH=sm_80
-DDPCPP_SYCL_ARCH=sm_90a
cmake --build .

- name: Unit test
14 changes: 14 additions & 0 deletions benchmarks/CMakeLists.txt
@@ -26,6 +26,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

if (SYCL_NVIDIA_TARGET)
find_package(CUDAToolkit)
endif()

add_custom_target(cutlass_benchmarks)

@@ -56,6 +59,17 @@ function(cutlass_benchmark_add_executable NAME)
benchmark::benchmark
)

if (SYCL_NVIDIA_TARGET)
target_link_libraries(
${NAME}
PRIVATE
CUDA::cudart
CUDA::cuda_driver
)

target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
endif()

target_include_directories(
${NAME}
PRIVATE
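Note on the two additions above: linking CUDA::cudart and CUDA::cuda_driver presumably satisfies the CUDA runtime and driver calls that the CUDA-compatibility path emits, and the __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__ definitions stand in for the values nvcc would normally predefine, taken here from the toolkit located by find_package(CUDAToolkit). A sketch of the kind of version gate those macros feed (threshold and header mirror the cuda_host_adapter.hpp hunk later in this diff):

```cpp
// Sketch only: under nvcc these macros are predefined by the compiler; under
// the SYCL build they come from the target_compile_definitions above, so
// version gates like this one (cf. cuda_host_adapter.hpp below) still select
// the intended code path.
#if ((__CUDACC_VER_MAJOR__ >= 12) || \
     ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
#include <cudaTypedefs.h>  // CUDA driver API typedefs, available from CUDA 11.8
#endif
```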
4 changes: 4 additions & 0 deletions cmake/FindDPCPP.cmake
@@ -53,9 +53,13 @@ endif()

if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
list(APPEND DPCPP_FLAGS "-fsycl-cuda-compatibility")
# Allow use of grid constant; beneficial even if CUTensorMap is not used.
list(APPEND DPCPP_FLAGS "-fno-sycl-decompose-functor")
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
set(CMAKE_CUDA_ARCHITECTURES "${CUTLASS_NVCC_ARCHS}")
endif()
endif()

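The comment added above ties -fno-sycl-decompose-functor to grid-constant support. A hedged illustration of what that enables (the kernel and Params struct are hypothetical; the assumption is that -fsycl-cuda-compatibility lets the SYCL device compiler accept CUDA's __grid_constant__ parameter annotation, while the no-decompose flag keeps the kernel's parameter object in one piece):

```cpp
// Hypothetical kernel, for illustration only.
struct Params {
  float const* in;
  float* out;
  int n;
};

// __grid_constant__ keeps `params` as a single read-only object in kernel
// parameter space rather than having it decomposed into separate arguments.
__global__ void toy_kernel(const __grid_constant__ Params params) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < params.n) {
    params.out[i] = params.in[i];
  }
}
```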
@@ -265,16 +265,12 @@ struct ExampleRunner {
M * N // batch_stride_D
);

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait_and_throw();
#else
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Reference kernel failed. Last CUDA error: "
<< cudaGetErrorString(result) << std::endl;
return false;
}
#endif

// Check if output from CUTLASS kernel and reference kernel are equal or not
bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
2 changes: 1 addition & 1 deletion examples/35_gemm_softmax/gemm_online_softmax.cpp
@@ -247,7 +247,7 @@ struct ExampleRunner {
float abs_ref = fabs((float)vector_Input_Ref.at(i));
float relative_diff = abs_ref > abs_tol ? abs_diff / abs_ref : 0;
if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) {
printf("i = %d diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
printf("i = %ld diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
return false;
}

2 changes: 2 additions & 0 deletions examples/35_gemm_softmax/softmax_finalize.hpp
@@ -90,6 +90,8 @@ class SoftmaxFinalize {
};

static constexpr int SharedStorageSize = sizeof(SharedStorage);
static constexpr int MaxThreadsPerBlock = MaxNumThreadsPerBlock;
static constexpr int MinBlocksPerMultiprocessor = 1;

//
// Params struct
@@ -510,7 +510,7 @@ int main(int argc, char const **args) {
//

#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
run<Gemm>(options);
return run<Gemm>(options);
#endif

return 0;
26 changes: 24 additions & 2 deletions examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh
@@ -37,7 +37,7 @@ namespace example

// Naive grid-stride loop implementation of gather
template<typename Element, typename Func>
__global__ void
CUTLASS_GLOBAL void
gather_kernel(Element const * __restrict__ input,
Element * __restrict__ output,
Func func,
@@ -76,17 +76,28 @@ gather(Element const * input,

cutlass::FastDivmod stride_divmod(stride_upcast);
dim3 blocks(hw_info.sm_count, 1, batch_size);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<gather_kernel<uint128_t,Func>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#else
gather_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#endif
}

// Naive grid-stride loop implementation of scatter
template<typename Element, typename Func>
__global__ void
CUTLASS_GLOBAL void
scatter_kernel(Element const * __restrict__ input,
Element * __restrict__ output,
Func func,
@@ -125,12 +136,23 @@

cutlass::FastDivmod stride_divmod(stride_upcast);
dim3 blocks(hw_info.sm_count, 1, batch_size);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<scatter_kernel<uint128_t,Func>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#else
scatter_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#endif
}

} // namespace example
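Both hunks in this file follow the same pattern: when CUTLASS_ENABLE_SYCL is set, the `<<<grid, block>>>` launch is replaced with `syclcompat::launch<Kernel>(grid, block, args...)`. A minimal standalone sketch of that pattern, assuming DPC++'s syclcompat header and that CUTLASS_GLOBAL (from cutlass/cutlass.h) expands to the appropriate kernel annotation for the active backend:

```cpp
#include <syclcompat/syclcompat.hpp>
#include "cutlass/cutlass.h"

// Toy kernel, illustrative only; this replaces the CUDA form
//   scale_kernel<<<grid, 256>>>(data, factor, n);
CUTLASS_GLOBAL void scale_kernel(float* data, float factor, int n) {
  int i = static_cast<int>(syclcompat::global_id::x());  // global thread index
  if (i < n) { data[i] *= factor; }
}

void scale(float* data, float factor, int n) {
  syclcompat::dim3 grid((n + 255) / 256);
  // The kernel is a template argument; the trailing arguments are forwarded
  // to it, mirroring the <<<grid, block>>> argument list.
  syclcompat::launch<scale_kernel>(grid, 256, data, factor, n);
}
```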
13 changes: 12 additions & 1 deletion examples/53_hopper_gemm_permute/permute_kernel.cuh
@@ -47,7 +47,7 @@ namespace example
* For row major, the inputs must be switched accordingly.
*/
template<bool Batched, typename Element, typename Permute>
__global__ void
CUTLASS_GLOBAL void
permute_kernel(Element const* __restrict__ input,
Element* __restrict__ output,
Permute permute,
@@ -82,11 +82,22 @@ void permute(Element const* input,

cutlass::FastDivmod stride_divmod(stride);
dim3 blocks(hw_info.sm_count, 1, batch_count);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<permute_kernel<Batched, cute::uint128_t, Permute>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
permute_upcast,
num_elems_upcast,
stride_upcast);

#else
permute_kernel<Batched><<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
permute_upcast,
num_elems_upcast,
stride_upcast);
#endif
}

} // namespace example
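One detail in this hunk: the CUDA launch `permute_kernel<Batched><<<blocks, 1024>>>(...)` deduces the remaining template parameters from its call arguments, while `syclcompat::launch` takes the kernel as a template argument and therefore needs the instantiation spelled out in full (`permute_kernel<Batched, cute::uint128_t, Permute>`). A toy contrast with a hypothetical kernel:

```cpp
// Hypothetical kernel used only to contrast the two launch spellings.
template <bool Batched, typename Element>
CUTLASS_GLOBAL void copy_kernel(Element const* in, Element* out, int n);

// CUDA: Element is deduced from the pointer arguments.
//   copy_kernel<true><<<grid, 256>>>(in, out, n);
//
// syclcompat: the kernel is a non-type template argument, so every template
// parameter must be specified explicitly.
//   syclcompat::launch<copy_kernel<true, float>>(grid, 256, in, out, n);
```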
41 changes: 38 additions & 3 deletions examples/CMakeLists.txt
@@ -30,6 +30,10 @@

set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common)

if (SYCL_NVIDIA_TARGET)
find_package(CUDAToolkit)
endif()

add_custom_target(cutlass_examples)
add_custom_target(test_examples)

@@ -56,11 +60,22 @@ function(cutlass_example_add_executable NAME)
${NAME}
PRIVATE
CUTLASS
cutlass_tools_util_includes
cutlass_library_includes
$<$<BOOL:${CUTLASS_ENABLE_CUBLAS}>:nvidia::cublas>
$<$<BOOL:${ADD_CUDA}>:cuda>
)

if (SYCL_NVIDIA_TARGET)
target_link_libraries(
${NAME}
PRIVATE
CUDA::cudart
CUDA::cuda_driver
)

target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
endif()

target_include_directories(
${NAME}
PRIVATE
@@ -174,13 +189,33 @@ foreach(EXAMPLE
add_subdirectory(${EXAMPLE})
endforeach()
else()
foreach(EXAMPLE
set(EXAMPLES
14_ampere_tf32_tensorop_gemm
35_gemm_softmax
cute
sycl
)

if (SYCL_NVIDIA_TARGET)
list(APPEND EXAMPLES
48_hopper_warp_specialized_gemm
49_hopper_gemm_with_collective_builder
50_hopper_gemm_with_epilogue_swizzle
51_hopper_gett
52_hopper_gather_scatter_fusion
53_hopper_gemm_permute
54_hopper_fp8_warp_specialized_gemm
55_hopper_mixed_dtype_gemm
56_hopper_ptr_array_batched_gemm
57_hopper_grouped_gemm
61_hopper_gemm_with_topk_and_softmax
63_hopper_gemm_with_weight_prefetch
# 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling # error: SYCL kernel cannot call a variadic function
69_hopper_mixed_dtype_grouped_gemm
)
endif()

foreach(EXAMPLE ${EXAMPLES})
add_subdirectory(${EXAMPLE})
endforeach()
endif()

19 changes: 17 additions & 2 deletions include/cute/arch/util.hpp
@@ -279,7 +279,6 @@ explode(Fn fn,
return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...);
}

#if defined(CUTLASS_ENABLE_SYCL)
template <class MMA_Op,
class PtrD, int... Id,
class PtrA, int... Ia,
@@ -294,7 +293,23 @@ explode_mma(PtrD&& d, int_sequence<Id...>,
{
return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...);
}
#endif

template <class MMA_Op,
class PtrD, int... Id,
class PtrA, int... Ia,
class PtrB, int... Ib,
class PtrC, int... Ic,
class PtrE, int... Ie>
CUTE_HOST_DEVICE constexpr
void
explode_mma(PtrD&& d, int_sequence<Id...>,
PtrA&& a, int_sequence<Ia...>,
PtrB&& b, int_sequence<Ib...>,
PtrC&& c, int_sequence<Ic...>,
PtrE&& e, int_sequence<Ie...>)
{
return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...);
}

//
// Utility for exploding tuples into functions
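The new explode_mma overload above adds a fifth pointer/int_sequence pair for ops whose fma takes an extra argument pack (presumably the sparse GMMA traits below), and the existing overload loses its SYCL-only guard. In both, the MMA op is a template parameter instead of being passed as the function argument MMA_Op::fma, which the SYCL device path appears to need. A toy illustration of the expansion (ToyOp is hypothetical; the real call sites are the mma_unpack hunks below):

```cpp
#include <cute/arch/util.hpp>

// ToyOp only shows how the int_sequences expand the arrays into individual
// fma() arguments; it is not a real MMA instruction.
struct ToyOp {
  CUTE_HOST_DEVICE static void
  fma(float& d0, float& d1,
      float const& a0, float const& b0,
      float const& c0, float const& c1) {
    d0 = a0 * b0 + c0;
    d1 = a0 * b0 + c1;
  }
};

CUTE_HOST_DEVICE void toy_mma() {
  float d[2] = {}, a[1] = {1.f}, b[1] = {2.f}, c[2] = {3.f, 4.f};
  // Expands to ToyOp::fma(d[0], d[1], a[0], b[0], c[0], c[1]).
  cute::detail::explode_mma<ToyOp>(d, cute::make_int_sequence<2>{},
                                   a, cute::make_int_sequence<1>{},
                                   b, cute::make_int_sequence<1>{},
                                   c, cute::make_int_sequence<2>{});
}
```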
2 changes: 1 addition & 1 deletion include/cute/atom/mma_traits_sm90_gmma.hpp
@@ -418,7 +418,7 @@ mma_unpack(MMA_Traits<MMA_Op, MMA_Args...> const& traits,
CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});

detail::explode(MMA_Op::fma,
detail::explode_mma<MMA_Op>(
rA, make_int_sequence<RegNumA>{},
rB, make_int_sequence<RegNumB>{},
rC, make_int_sequence<RegNumC>{},
3 changes: 2 additions & 1 deletion include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
@@ -197,7 +197,8 @@ mma_unpack(MMA_Traits<MMAOp> const& traits,

CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});

detail::explode(MMAOp::fma,

detail::explode_mma<MMAOp>(
rA, make_int_sequence<RegNumA>{},
rB, make_int_sequence<RegNumB>{},
rC, make_int_sequence<RegNumC>{},
8 changes: 4 additions & 4 deletions include/cutlass/bfloat16.h
@@ -45,9 +45,9 @@
#include <cstring>
#endif

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
#include <cuda_bf16.h>
#endif
#endif // defined(__CUDA__)

#include "cutlass/cutlass.h"
#include "cutlass/platform/platform.h"
@@ -103,7 +103,7 @@ struct alignas(2) bfloat16_t {
/// Default constructor
bfloat16_t() = default;

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
/// Reinterpret cast from CUDA's __nv_bfloat16 type
CUTLASS_HOST_DEVICE
explicit bfloat16_t(__nv_bfloat16 const & x) {
@@ -199,7 +199,7 @@
return (float(*this) != 0.0f);
}

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
/// Bitcasts to CUDA's bf16 type
CUTLASS_DEVICE
__nv_bfloat16 to_nv_bfloat16() const {
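The changes in this file (and in complex.h and cuda_host_adapter.hpp below) widen the `#if !defined(CUTLASS_ENABLE_SYCL)` guards to also accept `defined(__CUDA__)`, on the assumption that DPC++'s -fsycl-cuda-compatibility mode defines __CUDA__ and makes CUDA headers such as <cuda_bf16.h> usable from the SYCL build. A condensed sketch of the resulting selection logic (native_bf16 is a hypothetical alias, used only to show which branch each configuration takes):

```cpp
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
  // Plain CUDA build, or SYCL build in CUDA-compatibility mode: the native
  // CUDA bf16 type and its conversions are available.
  #include <cuda_bf16.h>
  using native_bf16 = __nv_bfloat16;
#else
  // SYCL build for a non-CUDA backend: no CUDA headers; fall back to the
  // portable cutlass::bfloat16_t emulation.
  #include "cutlass/bfloat16.h"
  using native_bf16 = cutlass::bfloat16_t;
#endif
```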
2 changes: 1 addition & 1 deletion include/cutlass/complex.h
@@ -31,7 +31,7 @@

#pragma once

#if defined(CUTLASS_ENABLE_SYCL)
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__)
#include <cutlass/sycl_complex.h>
#include <cutlass/sycl_fp16.h>
#else
6 changes: 4 additions & 2 deletions include/cutlass/cuda_host_adapter.hpp
@@ -35,7 +35,7 @@

#pragma once

#if !defined(CUTLASS_ENABLE_SYCL)
#if defined(__CUDA__)
#include <cuda_runtime_api.h>
#endif
#include "cutlass/cutlass.h"
@@ -85,14 +85,16 @@ namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////


#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL)
#if !defined(__CUDACC_RTC__)

#if ((__CUDACC_VER_MAJOR__ >= 12) || \
((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
#include <cudaTypedefs.h>
#endif // (__CUDACC_VERSION__ >= 11.8)

#if !defined(CUTLASS_ENABLE_SYCL)
#include <driver_types.h>
#endif

#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok
