Enable SM90 via sycl-cuda-compat #276

Open · wants to merge 9 commits into base: sycl-develop
2 changes: 1 addition & 1 deletion .github/workflows/nvidia_test.yml
@@ -63,7 +63,7 @@ jobs:
cmake -G Ninja \
-DCUTLASS_ENABLE_SYCL=ON \
-DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \
-DDPCPP_SYCL_ARCH=sm_80
-DDPCPP_SYCL_ARCH=sm_90a
cmake --build .

- name: Unit test
14 changes: 14 additions & 0 deletions benchmarks/CMakeLists.txt
@@ -26,6 +26,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

if (SYCL_NVIDIA_TARGET)
find_package(CUDAToolkit)
endif()

add_custom_target(cutlass_benchmarks)

@@ -56,6 +59,17 @@ function(cutlass_benchmark_add_executable NAME)
benchmark::benchmark
)

if (SYCL_NVIDIA_TARGET)
target_link_libraries(
${NAME}
PRIVATE
CUDA::cudart
CUDA::cuda_driver
)

target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
endif()

target_include_directories(
${NAME}
PRIVATE
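Note on the two additions above: linking CUDA::cudart and CUDA::cuda_driver presumably satisfies the CUDA runtime and driver calls that the CUDA-compatibility path emits, and the __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__ definitions stand in for the values nvcc would normally predefine, taken here from the toolkit located by find_package(CUDAToolkit). A sketch of the kind of version gate those macros feed (threshold and header mirror the cuda_host_adapter.hpp hunk later in this diff):

```cpp
// Sketch only: under nvcc these macros are predefined by the compiler; under
// the SYCL build they come from the target_compile_definitions above, so
// version gates like this one (cf. cuda_host_adapter.hpp below) still select
// the intended code path.
#if ((__CUDACC_VER_MAJOR__ >= 12) || \
     ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
#include <cudaTypedefs.h>  // CUDA driver API typedefs, available from CUDA 11.8
#endif
```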
4 changes: 4 additions & 0 deletions cmake/FindDPCPP.cmake
@@ -53,9 +53,13 @@ endif()

if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
list(APPEND DPCPP_FLAGS "-fsycl-cuda-compatibility")
# Allow use of grid constant; beneficial even if CUTensorMap is not used.
list(APPEND DPCPP_FLAGS "-fno-sycl-decompose-functor")
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
set(CMAKE_CUDA_ARCHITECTURES "${CUTLASS_NVCC_ARCHS}")
endif()
endif()

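The comment added above ties -fno-sycl-decompose-functor to grid-constant support. A hedged illustration of what that enables (the kernel and Params struct are hypothetical; the assumption is that -fsycl-cuda-compatibility lets the SYCL device compiler accept CUDA's __grid_constant__ parameter annotation, while the no-decompose flag keeps the kernel's parameter object in one piece):

```cpp
// Hypothetical kernel, for illustration only.
struct Params {
  float const* in;
  float* out;
  int n;
};

// __grid_constant__ keeps `params` as a single read-only object in kernel
// parameter space rather than having it decomposed into separate arguments.
__global__ void toy_kernel(const __grid_constant__ Params params) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < params.n) {
    params.out[i] = params.in[i];
  }
}
```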
@@ -265,16 +265,12 @@ struct ExampleRunner {
M * N // batch_stride_D
);

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait_and_throw();
#else
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Reference kernel failed. Last CUDA error: "
<< cudaGetErrorString(result) << std::endl;
return false;
}
#endif

// Check if output from CUTLASS kernel and reference kernel are equal or not
bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
2 changes: 1 addition & 1 deletion examples/35_gemm_softmax/gemm_online_softmax.cpp
@@ -247,7 +247,7 @@ struct ExampleRunner {
float abs_ref = fabs((float)vector_Input_Ref.at(i));
float relative_diff = abs_ref > abs_tol ? abs_diff / abs_ref : 0;
if ( (isnan(abs_diff) || isinf(abs_diff)) || (abs_diff > abs_tol && relative_diff > rel_tol)) {
printf("i = %d diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
printf("i = %ld diff = %f, {%f, %f}.\n", i, abs_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
return false;
}

2 changes: 2 additions & 0 deletions examples/35_gemm_softmax/softmax_finalize.hpp
@@ -90,6 +90,8 @@ class SoftmaxFinalize {
};

static constexpr int SharedStorageSize = sizeof(SharedStorage);
static constexpr int MaxThreadsPerBlock = MaxNumThreadsPerBlock;
static constexpr int MinBlocksPerMultiprocessor = 1;

//
// Params struct
@@ -510,7 +510,7 @@ int main(int argc, char const **args) {
//

#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
run<Gemm>(options);
return run<Gemm>(options);
#endif

return 0;
26 changes: 24 additions & 2 deletions examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh
@@ -37,7 +37,7 @@ namespace example

// Naive grid-stride loop implementation of gather
template<typename Element, typename Func>
__global__ void
CUTLASS_GLOBAL void
gather_kernel(Element const * __restrict__ input,
Element * __restrict__ output,
Func func,
@@ -76,17 +76,28 @@ gather(Element const * input,

cutlass::FastDivmod stride_divmod(stride_upcast);
dim3 blocks(hw_info.sm_count, 1, batch_size);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<gather_kernel<uint128_t,Func>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#else
gather_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#endif
}

// Naive grid-stride loop implementation of scatter
template<typename Element, typename Func>
__global__ void
CUTLASS_GLOBAL void
scatter_kernel(Element const * __restrict__ input,
Element * __restrict__ output,
Func func,
@@ -125,12 +136,23 @@

cutlass::FastDivmod stride_divmod(stride_upcast);
dim3 blocks(hw_info.sm_count, 1, batch_size);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<scatter_kernel<uint128_t,Func>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#else
scatter_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
func,
num_elems_input_upcast,
num_elems_output_upcast,
stride_divmod);
#endif
}

} // namespace example
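Both hunks in this file follow the same pattern: when CUTLASS_ENABLE_SYCL is set, the `<<<grid, block>>>` launch is replaced with `syclcompat::launch<Kernel>(grid, block, args...)`. A minimal standalone sketch of that pattern, assuming DPC++'s syclcompat header and that CUTLASS_GLOBAL (from cutlass/cutlass.h) expands to the appropriate kernel annotation for the active backend:

```cpp
#include <syclcompat/syclcompat.hpp>
#include "cutlass/cutlass.h"

// Toy kernel, illustrative only; this replaces the CUDA form
//   scale_kernel<<<grid, 256>>>(data, factor, n);
CUTLASS_GLOBAL void scale_kernel(float* data, float factor, int n) {
  int i = static_cast<int>(syclcompat::global_id::x());  // global thread index
  if (i < n) { data[i] *= factor; }
}

void scale(float* data, float factor, int n) {
  syclcompat::dim3 grid((n + 255) / 256);
  // The kernel is a template argument; the trailing arguments are forwarded
  // to it, mirroring the <<<grid, block>>> argument list.
  syclcompat::launch<scale_kernel>(grid, 256, data, factor, n);
}
```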
13 changes: 12 additions & 1 deletion examples/53_hopper_gemm_permute/permute_kernel.cuh
@@ -47,7 +47,7 @@ namespace example
* For row major, the inputs must be switched accordingly.
*/
template<bool Batched, typename Element, typename Permute>
__global__ void
CUTLASS_GLOBAL void
permute_kernel(Element const* __restrict__ input,
Element* __restrict__ output,
Permute permute,
@@ -82,11 +82,22 @@ void permute(Element const* input,

cutlass::FastDivmod stride_divmod(stride);
dim3 blocks(hw_info.sm_count, 1, batch_count);
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::dim3 sycl_grid(blocks.x, blocks.y, blocks.z);
syclcompat::launch<permute_kernel<Batched, cute::uint128_t, Permute>>(sycl_grid, 1024,
reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
permute_upcast,
num_elems_upcast,
stride_upcast);

#else
permute_kernel<Batched><<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
reinterpret_cast<cute::uint128_t *>(output),
permute_upcast,
num_elems_upcast,
stride_upcast);
#endif
}

} // namespace example
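One detail in this hunk: the CUDA launch `permute_kernel<Batched><<<blocks, 1024>>>(...)` deduces the remaining template parameters from its call arguments, while `syclcompat::launch` takes the kernel as a template argument and therefore needs the instantiation spelled out in full (`permute_kernel<Batched, cute::uint128_t, Permute>`). A toy contrast with a hypothetical kernel:

```cpp
// Hypothetical kernel used only to contrast the two launch spellings.
template <bool Batched, typename Element>
CUTLASS_GLOBAL void copy_kernel(Element const* in, Element* out, int n);

// CUDA: Element is deduced from the pointer arguments.
//   copy_kernel<true><<<grid, 256>>>(in, out, n);
//
// syclcompat: the kernel is a non-type template argument, so every template
// parameter must be specified explicitly.
//   syclcompat::launch<copy_kernel<true, float>>(grid, 256, in, out, n);
```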
41 changes: 38 additions & 3 deletions examples/CMakeLists.txt
@@ -30,6 +30,10 @@

set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common)

if (SYCL_NVIDIA_TARGET)
find_package(CUDAToolkit)
endif()

add_custom_target(cutlass_examples)
add_custom_target(test_examples)

@@ -56,11 +60,22 @@ function(cutlass_example_add_executable NAME)
${NAME}
PRIVATE
CUTLASS
cutlass_tools_util_includes
cutlass_library_includes
$<$<BOOL:${CUTLASS_ENABLE_CUBLAS}>:nvidia::cublas>
$<$<BOOL:${ADD_CUDA}>:cuda>
)

if (SYCL_NVIDIA_TARGET)
target_link_libraries(
${NAME}
PRIVATE
CUDA::cudart
CUDA::cuda_driver
)

target_compile_definitions(${NAME} PRIVATE __CUDACC_VER_MAJOR__=${CUDAToolkit_VERSION_MAJOR} __CUDACC_VER_MINOR__=${CUDAToolkit_VERSION_MINOR})
endif()

target_include_directories(
${NAME}
PRIVATE
@@ -174,13 +189,33 @@ foreach(EXAMPLE
add_subdirectory(${EXAMPLE})
endforeach()
else()
foreach(EXAMPLE
set(EXAMPLES
14_ampere_tf32_tensorop_gemm
35_gemm_softmax
cute
sycl
)

if (SYCL_NVIDIA_TARGET)
list(APPEND EXAMPLES
48_hopper_warp_specialized_gemm
49_hopper_gemm_with_collective_builder
50_hopper_gemm_with_epilogue_swizzle
51_hopper_gett
52_hopper_gather_scatter_fusion
53_hopper_gemm_permute
54_hopper_fp8_warp_specialized_gemm
55_hopper_mixed_dtype_gemm
56_hopper_ptr_array_batched_gemm
57_hopper_grouped_gemm
61_hopper_gemm_with_topk_and_softmax
63_hopper_gemm_with_weight_prefetch
# 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling # error: SYCL kernel cannot call a variadic function
69_hopper_mixed_dtype_grouped_gemm
)
endif()

foreach(EXAMPLE ${EXAMPLES})
add_subdirectory(${EXAMPLE})
endforeach()
endif()

19 changes: 17 additions & 2 deletions include/cute/arch/util.hpp
@@ -279,7 +279,6 @@ explode(Fn fn,
return fn(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]..., f[If]..., g[Ig]...);
}

#if defined(CUTLASS_ENABLE_SYCL)
template <class MMA_Op,
class PtrD, int... Id,
class PtrA, int... Ia,
@@ -294,7 +293,23 @@ explode_mma(PtrD&& d, int_sequence<Id...>,
{
return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]...);
}
#endif

template <class MMA_Op,
class PtrD, int... Id,
class PtrA, int... Ia,
class PtrB, int... Ib,
class PtrC, int... Ic,
class PtrE, int... Ie>
CUTE_HOST_DEVICE constexpr
void
explode_mma(PtrD&& d, int_sequence<Id...>,
PtrA&& a, int_sequence<Ia...>,
PtrB&& b, int_sequence<Ib...>,
PtrC&& c, int_sequence<Ic...>,
PtrE&& e, int_sequence<Ie...>)
{
return MMA_Op::fma(d[Id]..., a[Ia]..., b[Ib]..., c[Ic]..., e[Ie]...);
}

//
// Utility for exploding tuples into functions
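The new explode_mma overload above adds a fifth pointer/int_sequence pair for ops whose fma takes an extra argument pack (presumably the sparse GMMA traits below), and the existing overload loses its SYCL-only guard. In both, the MMA op is a template parameter instead of being passed as the function argument MMA_Op::fma, which the SYCL device path appears to need. A toy illustration of the expansion (ToyOp is hypothetical; the real call sites are the mma_unpack hunks below):

```cpp
#include <cute/arch/util.hpp>

// ToyOp only shows how the int_sequences expand the arrays into individual
// fma() arguments; it is not a real MMA instruction.
struct ToyOp {
  CUTE_HOST_DEVICE static void
  fma(float& d0, float& d1,
      float const& a0, float const& b0,
      float const& c0, float const& c1) {
    d0 = a0 * b0 + c0;
    d1 = a0 * b0 + c1;
  }
};

CUTE_HOST_DEVICE void toy_mma() {
  float d[2] = {}, a[1] = {1.f}, b[1] = {2.f}, c[2] = {3.f, 4.f};
  // Expands to ToyOp::fma(d[0], d[1], a[0], b[0], c[0], c[1]).
  cute::detail::explode_mma<ToyOp>(d, cute::make_int_sequence<2>{},
                                   a, cute::make_int_sequence<1>{},
                                   b, cute::make_int_sequence<1>{},
                                   c, cute::make_int_sequence<2>{});
}
```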
2 changes: 1 addition & 1 deletion include/cute/atom/mma_traits_sm90_gmma.hpp
@@ -418,7 +418,7 @@ mma_unpack(MMA_Traits<MMA_Op, MMA_Args...> const& traits,
CUTE_STATIC_ASSERT_V(size(rB) == Int<RegNumB>{});
CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});

detail::explode(MMA_Op::fma,
detail::explode_mma<MMA_Op>(
rA, make_int_sequence<RegNumA>{},
rB, make_int_sequence<RegNumB>{},
rC, make_int_sequence<RegNumC>{},
3 changes: 2 additions & 1 deletion include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
@@ -197,7 +197,8 @@ mma_unpack(MMA_Traits<MMAOp> const& traits,

CUTE_STATIC_ASSERT_V(size(rC) == Int<RegNumC>{});

detail::explode(MMAOp::fma,

detail::explode_mma<MMAOp>(
rA, make_int_sequence<RegNumA>{},
rB, make_int_sequence<RegNumB>{},
rC, make_int_sequence<RegNumC>{},
8 changes: 4 additions & 4 deletions include/cutlass/bfloat16.h
@@ -45,9 +45,9 @@
#include <cstring>
#endif

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
#include <cuda_bf16.h>
#endif
#endif // defined(__CUDA__)

#include "cutlass/cutlass.h"
#include "cutlass/platform/platform.h"
@@ -103,7 +103,7 @@ struct alignas(2) bfloat16_t {
/// Default constructor
bfloat16_t() = default;

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
/// Reinterpret cast from CUDA's __nv_bfloat16 type
CUTLASS_HOST_DEVICE
explicit bfloat16_t(__nv_bfloat16 const & x) {
@@ -199,7 +199,7 @@
return (float(*this) != 0.0f);
}

#if !defined(CUTLASS_ENABLE_SYCL)
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
/// Bitcasts to CUDA's bf16 type
CUTLASS_DEVICE
__nv_bfloat16 to_nv_bfloat16() const {
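The changes in this file (and in complex.h and cuda_host_adapter.hpp below) widen the `#if !defined(CUTLASS_ENABLE_SYCL)` guards to also accept `defined(__CUDA__)`, on the assumption that DPC++'s -fsycl-cuda-compatibility mode defines __CUDA__ and makes CUDA headers such as <cuda_bf16.h> usable from the SYCL build. A condensed sketch of the resulting selection logic (native_bf16 is a hypothetical alias, used only to show which branch each configuration takes):

```cpp
#if !defined(CUTLASS_ENABLE_SYCL) || defined(__CUDA__)
  // Plain CUDA build, or SYCL build in CUDA-compatibility mode: the native
  // CUDA bf16 type and its conversions are available.
  #include <cuda_bf16.h>
  using native_bf16 = __nv_bfloat16;
#else
  // SYCL build for a non-CUDA backend: no CUDA headers; fall back to the
  // portable cutlass::bfloat16_t emulation.
  #include "cutlass/bfloat16.h"
  using native_bf16 = cutlass::bfloat16_t;
#endif
```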
2 changes: 1 addition & 1 deletion include/cutlass/complex.h
@@ -31,7 +31,7 @@

#pragma once

#if defined(CUTLASS_ENABLE_SYCL)
#if defined(CUTLASS_ENABLE_SYCL) && !defined(__CUDA__)
#include <cutlass/sycl_complex.h>
#include <cutlass/sycl_fp16.h>
#else
6 changes: 4 additions & 2 deletions include/cutlass/cuda_host_adapter.hpp
@@ -35,7 +35,7 @@

#pragma once

#if !defined(CUTLASS_ENABLE_SYCL)
#if defined(__CUDA__)
#include <cuda_runtime_api.h>
#endif
#include "cutlass/cutlass.h"
@@ -85,14 +85,16 @@ namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////


#if !defined(__CUDACC_RTC__) && !defined(CUTLASS_ENABLE_SYCL)
#if !defined(__CUDACC_RTC__)

#if ((__CUDACC_VER_MAJOR__ >= 12) || \
((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
#include <cudaTypedefs.h>
#endif // (__CUDACC_VERSION__ >= 11.8)

#if !defined(CUTLASS_ENABLE_SYCL)
#include <driver_types.h>
#endif

#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok
