diff --git a/.github/workflows/nvidia_test.yml b/.github/workflows/nvidia_test.yml
index 88ded86320..118fbb004e 100644
--- a/.github/workflows/nvidia_test.yml
+++ b/.github/workflows/nvidia_test.yml
@@ -63,7 +63,7 @@ jobs:
           cmake -G Ninja \
             -DCUTLASS_ENABLE_SYCL=ON \
             -DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \
-            -DDPCPP_SYCL_ARCH=sm_80
+            -DDPCPP_SYCL_ARCH=sm_90a
           cmake --build .

       - name: Unit test
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 728e5fb2ed..a331f652bd 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -26,6 +26,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+if (SYCL_NVIDIA_TARGET)
+  find_package(CUDAToolkit)
+endif()

 add_custom_target(cutlass_benchmarks)

@@ -56,6 +59,17 @@ function(cutlass_benchmark_add_executable NAME)
     benchmark::benchmark
   )

+  if (SYCL_NVIDIA_TARGET)
+    target_link_libraries(
+      ${NAME}
+      PRIVATE
+      CUDA::cudart
+      CUDA::cuda_driver
+    )
+
+    target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
+  endif()
+
   target_include_directories(
     ${NAME}
     PRIVATE
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
index f14a62b267..5ff7aa13e0 100644
--- a/cmake/FindDPCPP.cmake
+++ b/cmake/FindDPCPP.cmake
@@ -53,9 +53,13 @@ endif()

 if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
   if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
+    list(APPEND DPCPP_FLAGS "-fsycl-cuda-compatibility")
+    # Allow to use grid constant, beneficial even if CUTensorMap is not used.
+    list(APPEND DPCPP_FLAGS "-fno-sycl-decompose-functor")
     list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
     list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
     list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
+    set(CMAKE_CUDA_ARCHITECTURES "${CUTLASS_NVCC_ARCHS}")
   endif()
 endif()

diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
index 1280ad5b40..1e8f4efe47 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu
@@ -265,16 +265,12 @@ struct ExampleRunner {
         M * N // batch_stride_D
     );

-#if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::wait_and_throw();
-#else
     cudaError_t result = cudaDeviceSynchronize();
     if (result != cudaSuccess) {
       std::cerr << "Reference kernel failed. Last CUDA error: "
                 << cudaGetErrorString(result) << std::endl;
       return false;
     }
-#endif

     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
diff --git a/examples/35_gemm_softmax/gemm_online_softmax.cpp b/examples/35_gemm_softmax/gemm_online_softmax.cpp
index 11d01f6329..e2f473d089 100644
--- a/examples/35_gemm_softmax/gemm_online_softmax.cpp
+++ b/examples/35_gemm_softmax/gemm_online_softmax.cpp
@@ -247,7 +247,7 @@ struct ExampleRunner {
       float abs_ref = fabs((float)vector_Input_Ref.at(i));
      float relative_diff = abs_ref > abs_tol ?
abs_diff / abs_ref : 0; if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) { - printf("i = %d diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); + printf("i = %ld diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i))); return false; } diff --git a/examples/35_gemm_softmax/softmax_finalize.hpp b/examples/35_gemm_softmax/softmax_finalize.hpp index ca6e6ac93a..ab6ef57b1b 100644 --- a/examples/35_gemm_softmax/softmax_finalize.hpp +++ b/examples/35_gemm_softmax/softmax_finalize.hpp @@ -90,6 +90,8 @@ class SoftmaxFinalize { }; static constexpr int SharedStorageSize = sizeof(SharedStorage); + static constexpr int MaxThreadsPerBlock = MaxNumThreadsPerBlock; + static constexpr int MinBlocksPerMultiprocessor = 1; // // Params struct diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu index 3a35cd7197..3af4469184 100644 --- a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu +++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu @@ -510,7 +510,7 @@ int main(int argc, char const **args) { // #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) - run(options); + return run(options); #endif return 0; diff --git a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh index b4cafccbb5..070e80f853 100644 --- a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh +++ b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh @@ -37,7 +37,7 @@ namespace example // Naive grid-stride loop implementation of gather template -__global__ void +CUTLASS_GLOBAL void gather_kernel(Element const * __restrict__ input, Element * __restrict__ output, Func func, @@ -76,17 +76,28 @@ gather(Element const * input, cutlass::FastDivmod stride_divmod(stride_upcast); dim3 blocks(hw_info.sm_count, 1, batch_size); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +#else gather_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), func, num_elems_input_upcast, num_elems_output_upcast, stride_divmod); +#endif } // Naive grid-stride loop implementation of scatter template -__global__ void +CUTLASS_GLOBAL void scatter_kernel(Element const * __restrict__ input, Element * __restrict__ output, Func func, @@ -125,12 +136,23 @@ scatter(Element const * input, cutlass::FastDivmod stride_divmod(stride_upcast); dim3 blocks(hw_info.sm_count, 1, batch_size); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + func, + num_elems_input_upcast, + num_elems_output_upcast, + stride_divmod); +#else scatter_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), func, num_elems_input_upcast, num_elems_output_upcast, stride_divmod); +#endif } } // namespace example diff --git a/examples/53_hopper_gemm_permute/permute_kernel.cuh b/examples/53_hopper_gemm_permute/permute_kernel.cuh index 0cb1aad901..a94774978b 100644 --- a/examples/53_hopper_gemm_permute/permute_kernel.cuh +++ b/examples/53_hopper_gemm_permute/permute_kernel.cuh 
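The gather_kernel/scatter_kernel hunks above and the permute_kernel hunk below all follow the same pattern: keep the CUDA triple-chevron launch, and add a syclcompat::launch path under CUTLASS_ENABLE_SYCL. A minimal sketch of that pattern with a hypothetical axpy_kernel (not code from this patch; it assumes CUTLASS_GLOBAL from cutlass/detail/helper_macros.hpp, the launch<Kernel>(grid, block, args...) entry point used in these hunks, and that CUDA builtin indices are usable in SYCL device code once -fsycl-cuda-compatibility is enabled as in FindDPCPP.cmake above):

#include "cutlass/cutlass.h"   // assumed to bring in CUTLASS_GLOBAL and the SYCL compatibility shims

// Grid-stride kernel, same shape as gather_kernel/scatter_kernel.
template <class Element>
CUTLASS_GLOBAL void axpy_kernel(Element const* x, Element* y, Element a, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    y[i] = a * x[i] + y[i];
  }
}

template <class Element>
void launch_axpy(Element const* x, Element* y, Element a, int n) {
  unsigned grid_x = unsigned((n + 255) / 256);
#if defined(CUTLASS_ENABLE_SYCL)
  // The kernel is a template argument; its arguments are forwarded after the
  // launch dimensions, mirroring the gather/scatter/permute launches here.
  syclcompat::dim3 sycl_grid(grid_x, 1, 1);
  syclcompat::launch<axpy_kernel<Element>>(sycl_grid, 256, x, y, a, n);
#else
  axpy_kernel<Element><<<dim3(grid_x, 1, 1), 256>>>(x, y, a, n);
#endif
}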
@@ -47,7 +47,7 @@ namespace example * For row major, the inputs must be switched accordingly. */ template -__global__ void +CUTLASS_GLOBAL void permute_kernel(Element const* __restrict__ input, Element* __restrict__ output, Permute permute, @@ -82,11 +82,22 @@ void permute(Element const* input, cutlass::FastDivmod stride_divmod(stride); dim3 blocks(hw_info.sm_count, 1, batch_count); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z); + syclcompat::launch>(sycl_grid, 1024, + reinterpret_cast(input), + reinterpret_cast(output), + permute_upcast, + num_elems_upcast, + stride_upcast); + +#else permute_kernel<<>>(reinterpret_cast(input), reinterpret_cast(output), permute_upcast, num_elems_upcast, stride_upcast); +#endif } } // namespace example diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 84fc931118..1b8bde4ba5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -30,6 +30,10 @@ set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common) +if (SYCL_NVIDIA_TARGET) + find_package(CUDAToolkit) +endif() + add_custom_target(cutlass_examples) add_custom_target(test_examples) @@ -56,11 +60,22 @@ function(cutlass_example_add_executable NAME) ${NAME} PRIVATE CUTLASS - cutlass_tools_util_includes + cutlass_library_includes $<$:nvidia::cublas> $<$:cuda> ) + if (SYCL_NVIDIA_TARGET) + target_link_libraries( + ${NAME} + PRIVATE + CUDA::cudart + CUDA::cuda_driver + ) + + target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR}) + endif() + target_include_directories( ${NAME} PRIVATE @@ -174,13 +189,33 @@ foreach(EXAMPLE add_subdirectory(${EXAMPLE}) endforeach() else() - foreach(EXAMPLE + set(EXAMPLES 14_ampere_tf32_tensorop_gemm 35_gemm_softmax cute sycl + ) + + if (SYCL_NVIDIA_TARGET) + list(APPEND EXAMPLES + 48_hopper_warp_specialized_gemm + 49_hopper_gemm_with_collective_builder + 50_hopper_gemm_with_epilogue_swizzle + 51_hopper_gett + 52_hopper_gather_scatter_fusion + 53_hopper_gemm_permute + 54_hopper_fp8_warp_specialized_gemm + 55_hopper_mixed_dtype_gemm + 56_hopper_ptr_array_batched_gemm + 57_hopper_grouped_gemm + 61_hopper_gemm_with_topk_and_softmax + 63_hopper_gemm_with_weight_prefetch + # 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling # error: SYCL kernel cannot call a variadic function + 69_hopper_mixed_dtype_grouped_gemm ) + endif() + + foreach(EXAMPLE ${EXAMPLES}) add_subdirectory(${EXAMPLE}) endforeach() endif() - diff --git a/include/cute/arch/util.hpp b/include/cute/arch/util.hpp index b0899f7a83..82d4538dd8 100644 --- a/include/cute/arch/util.hpp +++ b/include/cute/arch/util.hpp @@ -279,7 +279,6 @@ explode(Fn fn, return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...); } -#if defined(CUTLASS_ENABLE_SYCL) template , { return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...); } -#endif + +template +CUTE_HOST_DEVICE constexpr +void +explode_mma(PtrD&& d, int_sequence, + PtrA&& a, int_sequence, + PtrB&& b, int_sequence, + PtrC&& c, int_sequence, + PtrE&& e, int_sequence) +{ + return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...); +} // // Utility for exploding tuples into functions diff --git a/include/cute/atom/mma_traits_sm90_gmma.hpp b/include/cute/atom/mma_traits_sm90_gmma.hpp index e688a7e6a8..028d0e79ae 100644 --- a/include/cute/atom/mma_traits_sm90_gmma.hpp +++ b/include/cute/atom/mma_traits_sm90_gmma.hpp @@ -418,7 +418,7 @@ mma_unpack(MMA_Traits const& 
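The explode_mma helpers added to cute/arch/util.hpp above, and now used unconditionally by mma_unpack in the hunks that follow, apply cute's index-sequence "explode" idiom with the MMA operation named as a template parameter instead of being passed in as MMA_Op::fma, so device code never handles a callable argument. A self-contained sketch of the idiom with a made-up FMA_2x1 operation and array sizes:

#include <cstdio>
#include <utility>

struct FMA_2x1 {
  // d[0..1] += a[0..1] * b[0]
  static void fma(float& d0, float& d1, float a0, float a1, float b0) {
    d0 += a0 * b0;
    d1 += a1 * b0;
  }
};

// Each integer pack expands one array into one scalar argument per element,
// exactly like MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...) in cute::detail.
template <class Op, class PtrD, class PtrA, class PtrB, int... Id, int... Ia, int... Ib>
void explode(PtrD&& d, std::integer_sequence<int, Id...>,
             PtrA&& a, std::integer_sequence<int, Ia...>,
             PtrB&& b, std::integer_sequence<int, Ib...>) {
  Op::fma(d[Id]..., a[Ia]..., b[Ib]...);
}

int main() {
  float d[2] = {0.f, 0.f};
  float a[2] = {1.f, 2.f};
  float b[1] = {3.f};
  explode<FMA_2x1>(d, std::make_integer_sequence<int, 2>{},
                   a, std::make_integer_sequence<int, 2>{},
                   b, std::make_integer_sequence<int, 1>{});
  std::printf("%g %g\n", d[0], d[1]);  // prints: 3 6
  return 0;
}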
traits, CUTE_STATIC_ASSERT_V(size(rB) == Int{}); CUTE_STATIC_ASSERT_V(size(rC) == Int{}); - detail::explode(MMA_Op::fma, + detail::explode_mma( rA, make_int_sequence{}, rB, make_int_sequence{}, rC, make_int_sequence{}, diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp index 13ff07c89f..4dc6eb2705 100644 --- a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp +++ b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp @@ -197,7 +197,8 @@ mma_unpack(MMA_Traits const& traits, CUTE_STATIC_ASSERT_V(size(rC) == Int{}); - detail::explode(MMAOp::fma, + + detail::explode_mma( rA, make_int_sequence{}, rB, make_int_sequence{}, rC, make_int_sequence{}, diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index 262f5b8974..ffdf2756c1 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -45,9 +45,9 @@ #include #endif -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) #include -#endif +#endif // defined(__CUDA__) #include "cutlass/cutlass.h" #include "cutlass/platform/platform.h" @@ -103,7 +103,7 @@ struct alignas(2) bfloat16_t { /// Default constructor bfloat16_t() = default; -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) /// Reinterpret cast from CUDA's __nv_bfloat16 type CUTLASS_HOST_DEVICE explicit bfloat16_t(__nv_bfloat16 const & x) { @@ -199,7 +199,7 @@ struct alignas(2) bfloat16_t { return (float(*this) != 0.0f); } -#if !defined(CUTLASS_ENABLE_SYCL) +#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__) /// Bitcasts to CUDA's bf16 type CUTLASS_DEVICE __nv_bfloat16 to_nv_bfloat16() const { diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index ea355c7130..efab864fe6 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -31,7 +31,7 @@ #pragma once -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) #include #include #else diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp index dffd2c7e15..4194c496aa 100644 --- a/include/cutlass/cuda_host_adapter.hpp +++ b/include/cutlass/cuda_host_adapter.hpp @@ -35,7 +35,7 @@ #pragma once -#if !defined(CUTLASS_ENABLE_SYCL) +#if defined(__CUDA__) #include #endif #include "cutlass/cutlass.h" @@ -85,14 +85,16 @@ namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// -#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL) +#if !defined(__CUDACC_RTC__) #if ((__CUDACC_VER_MAJOR__ >= 12) || \ ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) #include #endif // (__CUDACC_VERSION__ >= 11.8) +#if !defined(CUTLASS_ENABLE_SYCL) #include +#endif #define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok diff --git a/include/cutlass/detail/collective/mixed_input_utils.hpp b/include/cutlass/detail/collective/mixed_input_utils.hpp index 7a8ac1c012..791b2899ed 100644 --- a/include/cutlass/detail/collective/mixed_input_utils.hpp +++ b/include/cutlass/detail/collective/mixed_input_utils.hpp @@ -107,7 +107,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t xor_mask = 0x43084308; static constexpr uint32_t lo_mask = 0x000F000F; static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -159,7 +159,7 @@ struct LayoutAwareConvertImpl< static constexpr 
uint32_t or_mask = 0x43004300; static constexpr uint32_t lo_mask = 0x000F000F; static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -230,7 +230,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t lo_bias = 0x64086408; // {1032, 1032} static constexpr uint32_t hi_bias = 0xD480D480; // {-72, -72} static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) { half2& fp16x2_val = reinterpret_cast(r[ii + 0]); fp16x2_val = fp16x2_val - reinterpret_cast(lo_bias); @@ -308,7 +308,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t lo_bias = or_mask; // 0x64006400, {1024, 1024} static constexpr uint32_t hi_bias = 0xD400D400; // {-64, -64} static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) { half2& fp16x2_val = reinterpret_cast(r[ii + 0]); fp16x2_val = fp16x2_val - reinterpret_cast(lo_bias); @@ -412,7 +412,7 @@ struct LayoutAwareConvertImpl< static constexpr uint32_t and_mask_0 = 0x007F007F; static constexpr uint32_t and_mask_1 = 0x00800080; static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) CUTE_INVALID_CONTROL_PATH("Unimplemented"); #else asm volatile( @@ -477,7 +477,7 @@ struct LayoutAwareConvertImpl< : "n"(and_mask), "n"(xor_mask), "n"(immLut)); { static constexpr uint32_t bias = 0x64806480; -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - reinterpret_cast(bias); #else diff --git a/include/cutlass/float8.h b/include/cutlass/float8.h index 7be37d3fc8..d445d9f734 100644 --- a/include/cutlass/float8.h +++ b/include/cutlass/float8.h @@ -85,14 +85,15 @@ #include #endif -#if !defined(CUTLASS_ENABLE_SYCL) #ifdef CUDA_FP8_ENABLED #include -#endif -#include -#else +#endif // CUDA_FP8_ENABLED + +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) #include -#endif +#else +#include +#endif // defined(__CUDA__) #include "cutlass/cutlass.h" diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp index 5b22b29296..b498333665 100644 --- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp @@ -363,7 +363,7 @@ struct CollectiveMma< --k_tile_count; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; @@ -382,7 +382,7 @@ struct CollectiveMma< ++k_tile_iter; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp index 23b8a868a9..eb936eefe8 100644 --- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp 
+++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp @@ -289,7 +289,7 @@ struct CollectiveMma< --k_tile_count; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; @@ -308,7 +308,7 @@ struct CollectiveMma< ++k_tile_iter; // UNLOCK smem_pipe_write - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive); + pipeline.template producer_commit(smem_pipe_write); // Advance smem_pipe_write ++smem_pipe_write; diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index bcfc38e097..be7bbee167 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -490,7 +490,7 @@ struct CollectiveMma< // Copy scale tensors from global memory to shared memory copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); copy_if(scale_copy_b, tBpB_ScaleB, tBgB_ScaleB(_,_,*k_tile_iter), tBsB_ScaleB(_,_,write_stage)); - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + pipeline.template producer_commit(smem_pipe_write); ++k_tile_iter; diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 5588e0ca08..9449b8386f 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -393,7 +393,6 @@ class GemmUniversalAdapter< #if (CUTLASS_DEBUG_TRACE_LEVEL > 1) CUTLASS_TRACE_HOST("GemmUniversal::run: Use extended launch API"); #endif -#if !defined(CUTLASS_ENABLE_SYCL) [[maybe_unused]] constexpr bool is_static_1x1x1 = cute::is_static_v and cute::size(typename GemmKernel::DispatchPolicy::ClusterShape{}) == 1; @@ -401,6 +400,7 @@ class GemmUniversalAdapter< cute::size<1>(typename GemmKernel::DispatchPolicy::ClusterShape{}), cute::size<2>(typename GemmKernel::DispatchPolicy::ClusterShape{})); +#if !defined(CUTLASS_ENABLE_SYCL) // Dynamic cluster support [[maybe_unused]] dim3 fallback_cluster = dim3{0,0,0}; if constexpr (GemmKernel::ArchTag::kMinComputeCapability == 100 @@ -519,6 +519,32 @@ class GemmUniversalAdapter< } } +#elif defined(__CUDA__) + using namespace syclcompat::experimental; + auto launch_props = [smem_size, cluster] { + if constexpr (is_static_1x1x1) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + } else { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::cuda::cluster_size( + sycl::range<3>(cluster.z, cluster.y, cluster.x) + ), + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + } + }(); + launch_properties l_props(launch_props); + kernel_properties k_props( + sycl::ext::oneapi::experimental::max_linear_work_group_size + ); + syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, l_props, k_props}; +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("GemmUniversal::run: Launching sm90 kernel with syclcompat"); +#endif + auto event = syclcompat::experimental::launch>(policy, params); + 
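The statement above is the SYCL-on-NVIDIA launch path for the SM90 adapter: launch properties carry the dynamic shared-memory size (and, for non-trivial cluster shapes, the CUDA cluster dimensions), and the kernel goes to syclcompat::experimental::launch as a template argument. A usage sketch of the same shapes for a hypothetical kernel (the property and policy spellings are taken from the hunk above rather than verified against a particular syclcompat release; my_kernel, the 48 KB scratch size, and the 256-thread bound are made up):

#include <sycl/sycl.hpp>
#include <syclcompat.hpp>  // header layout assumed

struct KernelParams { /* ... */ };

// Stand-in for the CUTLASS device kernel entry point.
template <class Params>
void my_kernel(Params) {}

template <class Params>
void launch_with_scratch(Params const& params,
                         syclcompat::dim3 grid, syclcompat::dim3 block) {
  using namespace syclcompat::experimental;

  // Dynamic shared memory is requested via a launch property rather than the
  // third <<<...>>> argument.
  auto props = sycl::ext::oneapi::experimental::properties{
      sycl::ext::oneapi::experimental::work_group_scratch_size(48 * 1024)};

  launch_properties l_props(props);
  kernel_properties k_props(
      sycl::ext::oneapi::experimental::max_linear_work_group_size<256>);

  launch_policy policy{grid, block, l_props, k_props};
  auto event = launch<my_kernel<Params>>(policy, params);
  event.wait();
}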
EventManager::getInstance().addEvent(event); #endif } else { diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index 3ce9e01b90..91cc8d913f 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -38,8 +38,10 @@ #if defined(CUTLASS_ENABLE_SYCL) #include +#if !defined(__CUDA__) #include -#endif +#endif // !defined(__CUDA__) +#endif // defined(CUTLASS_ENABLE_SYCL) //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -322,6 +324,28 @@ T shfl_xor_sync( //////////////////////////////////////////////////////////////////////////////////////////////////// +// Needed for CUTLASS_TRACE_HOST + +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) +namespace syclcompat { +static inline std::ostream& operator<<(std::ostream &os, const dim3& dims) +{ + os << dims.x << ", " << dims.y << ", " << dims.z; + return os; +} +} +#endif + +#if defined(CUTLASS_ENABLE_SYCL) && defined(__CUDA__) +static inline std::ostream& operator<<(std::ostream &os, const dim3 &dims) +{ + os << dims.x << ", " << dims.y << ", " << dims.z; + return os; +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /* * The CUDA API has functions and types in the global namespace. Ideally, we'd generalize them for both, CUDA and SYCL, * but that requires major changes in Cutlass. To avoid that, we redefine them in the Cutlass namespace that is the base @@ -329,7 +353,7 @@ T shfl_xor_sync( * with CUDA definitions. When using CUDA, only the global definitions are available. This way we don't have to modify * the codebase, and we can rely on the compiler to select the right definition in both cases. */ -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) namespace cutlass { @@ -449,6 +473,6 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( namespace cute { using dim3 = syclcompat::dim3; } -#endif +#endif // defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h index 4a37f6bcb6..645bc16258 100644 --- a/include/cutlass/kernel_hardware_info.h +++ b/include/cutlass/kernel_hardware_info.h @@ -31,9 +31,11 @@ #pragma once #include "cutlass/device_kernel.h" -#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL) +#if !defined(__CUDACC_RTC__) +#if !defined(CUTLASS_ENABLE_SYCL) #include "cuda_runtime.h" #include "cutlass/cluster_launch.hpp" +#endif #include "cutlass/trace.h" #endif #include @@ -58,13 +60,13 @@ struct KernelHardwareInfo { // Methods // -#if defined (CUTLASS_ENABLE_SYCL) +#if defined (CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) static inline int query_device_multiprocessor_count(int device_id = 0) { auto queue = syclcompat::get_default_queue(); auto dev = queue.get_device(); int multiprocessor_count = 1; - //TODO (Codeplay): Replace with device.get_info() once available + //TODO (Codeplay): Replace with device.get_info() once available #if defined __SYCL_CUDA_ARCH__ multiprocessor_count = dev.get_info(); #elif defined SYCL_INTEL_TARGET diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index 3aeea4c375..c28eb19d17 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -5347,7 +5347,7 @@ struct NumericArrayConverter { CUTLASS_PRAGMA_UNROLL for 
(int ii = 0; ii < PackedResultType::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) t[ii] = syclcompat::dp4a(x, mask[ii], 0); #else t[ii] = __dp4a(x, mask[ii], 0); @@ -5563,7 +5563,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale_rep), @@ -5706,7 +5706,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale_rep), @@ -5860,7 +5860,7 @@ struct NumericArrayConverter { // Scale and subtract the FP16s to get the original int4 number as FP16. CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(hfma_scale, fp16x2_val, hfma_bias); #else @@ -5991,7 +5991,7 @@ struct NumericArrayConverter { static constexpr uint32_t hfma_scale = 0x2C003C00; // {1 / 16, 1} { - #if defined(CUTLASS_ENABLE_SYCL) + #if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = sycl::fma(fp16x2_val, reinterpret_cast(hfma_scale), reinterpret_cast(hfma_bias)); #else @@ -6111,7 +6111,7 @@ struct NumericArrayConverter { const half2& bias = reinterpret_cast(bias_rep); CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - bias; #else @@ -6195,7 +6195,7 @@ struct NumericArrayConverter { const half2& bias = reinterpret_cast(bias_rep); CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < RegArray::kElements; ++ii) { -#if defined(CUTLASS_ENABLE_SYCL) +#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__) half2& fp16x2_val = reinterpret_cast(r[ii]); fp16x2_val = fp16x2_val - bias; #else diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index a58893196f..143296065f 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -430,11 +430,18 @@ class PipelineTmaAsync { producer_commit(state.index(), bytes); } + template + CUTLASS_DEVICE + void producer_commit(PipelineState state) { + ArriveOp(producer_get_barrier(state.index())); + } +#if !defined(CUTLASS_ENABLE_SYCL) template CUTLASS_DEVICE void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) { cute::forward(user_defined_arrive_op)(producer_get_barrier(state.index()));; } +#endif // Prevents early exit of producer blocks in Cluster. // This should be called once before kernel exits. 
@@ -1074,12 +1081,20 @@ class PipelineAsync { producer_commit(state.index()); } + template + CUTLASS_DEVICE + void producer_commit(PipelineState state) { + ArriveOp(producer_get_barrier(state.index())); + producer_commit(state); + } +#if !defined(CUTLASS_ENABLE_SYCL) template CUTLASS_DEVICE void producer_commit(PipelineState state, UserDefinedArriveOp&& user_defined_arrive_op) { cute::forward(user_defined_arrive_op)(producer_get_barrier(state.index())); producer_commit(state); } +#endif // Prevents early exit of producer blocks in Cluster. // This should be called once before kernel exits. diff --git a/include/cutlass/sycl_vector_types.h b/include/cutlass/sycl_vector_types.h index fe5f931201..30a44eaf54 100644 --- a/include/cutlass/sycl_vector_types.h +++ b/include/cutlass/sycl_vector_types.h @@ -30,6 +30,8 @@ **************************************************************************************************/ #pragma once +#if !defined(__CUDA__) + #include "cutlass/detail/helper_macros.hpp" // Add these definitions in the cutlass namespace, so they do not clash with the ones in cuda @@ -103,3 +105,4 @@ int4 make_int4(int x, int y, int z, int w) { return int4 {x,y,z,w}; } } +#endif // !defined(__CUDA__) diff --git a/test/unit/cute/cooperative_gemm_common.hpp b/test/unit/cute/cooperative_gemm_common.hpp index 3fd61d93f9..e154d61a2b 100644 --- a/test/unit/cute/cooperative_gemm_common.hpp +++ b/test/unit/cute/cooperative_gemm_common.hpp @@ -226,7 +226,7 @@ cooperative_gemm_kernel(GMemALayout gmem_a_layout, constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8; - auto smem_buf = reinterpret_cast((char*)base_smem); + auto smem_buf = reinterpret_cast((char*)base_smem); auto* smem_ptr = reinterpret_cast(smem_buf); auto* smem_ptr_a = smem_ptr; @@ -301,7 +301,7 @@ cooperative_gemm_kernel_rmem_c(GMemALayout gmem_a_layout, constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8; - auto smem_buf = reinterpret_cast((char*)base_smem); + auto smem_buf = reinterpret_cast((char*)base_smem); auto* smem_ptr = reinterpret_cast(smem_buf); auto* smem_ptr_a = smem_ptr; auto* smem_ptr_b = smem_ptr_a + round_up((sizeof(TA) * cosize(smem_a_layout)), copy_max_vec_bytes); diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index ca9d220fc9..2d2c7e374a 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -193,6 +193,10 @@ void insert_to_device(T* device_begin, InputIterator begin, InputIterator end) { template class DeviceAllocation { public: +#if defined(CUTLASS_ENABLE_SYCL) + // keep a reference to the device so it's not destructed before the destructor of this class (siof) + sycl::device dev = syclcompat::get_default_queue().get_device(); +#endif /// Delete functor for CUDA device memory struct deleter { diff --git a/tools/util/include/cutlass/util/mixed_dtype_utils.hpp b/tools/util/include/cutlass/util/mixed_dtype_utils.hpp index 0c7a93a39f..c499173e9a 100644 --- a/tools/util/include/cutlass/util/mixed_dtype_utils.hpp +++ b/tools/util/include/cutlass/util/mixed_dtype_utils.hpp @@ -63,7 +63,7 @@ template < class ElementZero, class ScaleBroadCastLayout, class ThrLayout> -__global__ void dequantize_kernel(DequantizedElement* dq_buffer, +CUTLASS_GLOBAL void dequantize_kernel(DequantizedElement* dq_buffer, QuantizedElement const* q_buffer, OperandLayout const operand_layout, ElementScale const* scale_buffer, @@ -178,7 +178,14 @@ static void dequantize(DequantizedElement* dq_buffer, const 
auto blocks_y = batches; dim3 blocks(blocks_x, blocks_y, 1); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid{blocks.x, blocks.y, blocks.z}; + syclcompat::launch>(sycl_grid, tpb, + dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout); + +#else dequantize_kernel<<>>(dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout); +#endif CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -394,7 +401,7 @@ constexpr auto compute_memory_reordering_atom(AtomLayout atom_layout = {}, ValLa } template -__global__ void reorder_tensor_kernel( +CUTLASS_GLOBAL void reorder_tensor_kernel( cute::Tensor S, cute::Tensor D, TiledCopy tiled_copy) @@ -445,7 +452,18 @@ void reorder_tensor( auto tiled_D = group_modes<3,rank_v>(tiled_divide(D, TileShape{})); dim3 blocks{unsigned(size<1>(tiled_D)), 1u, unsigned(size<3>(tiled_D))}; +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_grid{blocks.x, blocks.y, blocks.z}; + syclcompat::launch>(sycl_grid, NumThreads, + S, D, tiled_copy); +#else reorder_tensor_kernel<<>>(S, D, tiled_copy); +#endif CUDA_CHECK(cudaDeviceSynchronize()); } diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h index 7d575d522c..4040a493bc 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/gemm.h @@ -99,6 +99,20 @@ void compute_gemm( ); // Launch a GEMM kernel +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::launch< + kernel::Gemm< + TensorRef, + TensorRef, + TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + >>(syclcompat::dim3(grid.x, grid.y, grid.z), + syclcompat::dim3(block.x, block.y, block.z), +#else kernel::Gemm< TensorRef, TensorRef, @@ -109,6 +123,7 @@ void compute_gemm( InnerProductOp, ConvertOp ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, @@ -333,6 +348,20 @@ void BatchedGemm( batch_count ); +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::launch< + kernel::BatchedGemm< + TensorRefCollectionA, + TensorRefCollectionB, + TensorRefCollectionC, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + >>(syclcompat::dim3(grid.x, grid.y, grid.z), + syclcompat::dim3(block.x, block.y, block.z), +#else // Launch a GEMM kernel kernel::BatchedGemm< TensorRefCollectionA, @@ -344,6 +373,7 @@ void BatchedGemm( InnerProductOp, ConvertOp ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index 4e45988a8f..eb3ac61dd6 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -233,10 +233,6 @@ void GemmComplex( int const kMblock = 4; int const kNblock = 4; -#if defined (CUTLASS_ENABLE_SYCL) -using syclcompat::dim3; -#endif - dim3 block(16, 8); dim3 grid( (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), @@ -246,38 +242,23 @@ using syclcompat::dim3; if (grid.y <= std::numeric_limits::max()) { #if defined(CUTLASS_ENABLE_SYCL) - + syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); + syclcompat::dim3 sycl_block(block.x, block.y, block.z); syclcompat::launch>(grid, block, - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - 
batch_stride_B, - batch_stride_C, - batch_stride_D - ); + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kMblock, + kNblock + >>(sycl_grid, sycl_block, #else kernel::GemmComplex< ElementA, @@ -294,6 +275,7 @@ using syclcompat::dim3; kMblock, kNblock ><<< grid, block >>>( +#endif problem_size, alpha, tensor_a, @@ -310,7 +292,6 @@ using syclcompat::dim3; batch_stride_C, batch_stride_D ); -#endif } else { // Using bigger thread tile size int const kBigMblock = 4; @@ -324,37 +305,23 @@ using syclcompat::dim3; ); #if defined (CUTLASS_ENABLE_SYCL) + syclcompat::dim3 sycl_Biggrid(Biggrid.x, Biggrid.y, Biggrid.z); + syclcompat::dim3 sycl_Bigblock(Bigblock.x, Bigblock.y, Bigblock.z); syclcompat::launch>(Biggrid, Bigblock, - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - batch_stride_B, - batch_stride_C, - batch_stride_D - ); + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kBigMblock, + kBigNblock + >>(sycl_Biggrid, sycl_Bigblock, #else kernel::GemmComplex< ElementA, @@ -371,6 +338,7 @@ using syclcompat::dim3; kBigMblock, kBigNblock ><<< Biggrid, Bigblock >>>( +#endif problem_size, alpha, tensor_a, @@ -387,7 +355,6 @@ using syclcompat::dim3; batch_stride_C, batch_stride_D ); -#endif } } diff --git a/tools/util/include/cutlass/util/reference/device/gett.hpp b/tools/util/include/cutlass/util/reference/device/gett.hpp index 497a257d17..0f39f2084c 100644 --- a/tools/util/include/cutlass/util/reference/device/gett.hpp +++ b/tools/util/include/cutlass/util/reference/device/gett.hpp @@ -44,7 +44,7 @@ template < class DTensor, class ElementAccumulator, class ElementEpilogue> -__global__ static +CUTLASS_GLOBAL void gett_kernel( DTensor D, @@ -140,7 +140,19 @@ gett( dim3 dimBlock(256); dim3 dimGrid(240); +#if defined(CUTLASS_ENABLE_SYCL) + const syclcompat::dim3 sycl_grid(dimGrid.x, dimGrid.y, dimGrid.z); + const syclcompat::dim3 sycl_block(dimBlock.x, dimBlock.y, dimBlock.z); + syclcompat::launch>(sycl_grid, sycl_block, D, A, B, C, alpha, beta, ElementAccumulator(0)); +#else gett_kernel<<< dimGrid, dimBlock, 0, stream >>>(D, A, B, C, alpha, beta, ElementAccumulator(0)); +#endif } } // namespace cutlass::reference::device diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 6e131126a3..319f6e05b0 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -59,7 +59,7 @@ template < typename InnerProductOp, typename ConvertOp > -__global__ void Gemm( +CUTLASS_GLOBAL void Gemm( gemm::GemmCoord problem_size, ScalarType alpha, TensorRefA tensor_a, @@ -110,7 +110,7 @@ template < typename InnerProductOp, typename ConvertOp > -__global__ void BatchedGemm( +CUTLASS_GLOBAL void BatchedGemm( gemm::GemmCoord problem_size, ScalarType alpha, TensorRefCollectionA tensor_collection_a, diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 3de89c0648..582beb65f4 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -33,6 +33,7 @@ #include #include 
"cutlass/cutlass.h" #include "cutlass/util/reference/device/kernel/tensor_foreach.h" +#include "cutlass/kernel_hardware_info.h" namespace cutlass { namespace reference { @@ -74,8 +75,8 @@ struct TensorForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, size, params); #else dim3 grid(grid_size, 1, 1); @@ -102,8 +103,8 @@ struct TensorDiagonalForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3((end - start + block_size - 1) / block_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid((end - start + block_size - 1) / block_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, size, params, start, end); #else dim3 block(block_size, 1, 1); @@ -151,8 +152,8 @@ struct BlockForEach { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block_size, 1, 1); - const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + const syclcompat::dim3 sycl_block(block_size, 1, 1); + const syclcompat::dim3 sycl_grid(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, ptr, capacity, params); #else dim3 grid(grid_size, 1, 1);