diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 09d57d672e..d8ee96231a 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -195,7 +195,7 @@ if(NOT FBGEMM_CPU_ONLY) add_subdirectory(experimental/gemm) endif() -if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) +if(NOT FBGEMM_CPU_ONLY) # TODO: Re-enable gen_ai for ROCm once ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp # lands into latest ROCm add_subdirectory(experimental/gen_ai) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip index 25f532ad35..ece74362d2 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip @@ -23,8 +23,40 @@ #include "ck/ck.hpp" #include "ck/stream_config.hpp" #include "ck/host_utility/hip_check_error.hpp" + +#if __has_include("ck/utility/flush_icache.hpp") + #include "ck/utility/flush_icache.hpp" +#else + +#include + +namespace ck { +// Copied from: https://github.com/ROCm/composable_kernel/blob/rocm-6.2.0/include/ck/utility/flush_icache.hpp +static __global__ void flush_icache() { + asm __volatile__("s_icache_inv \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" + "s_nop 0 \n\t" :: + :); +} +} // namespace ck + +#endif namespace fbgemm_gpu { void flush_icache_ck() diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip index 53b8020c6b..8f13c204e8 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip @@ -15,6 +15,8 @@ #include #include +#if (defined(USE_ROCM) && ROCM_VERSION >= 60300) + #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -30,6 +32,8 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +// NOTE: This source is currently only available in the `develop` branch of CK +// https://github.com/ROCm/composable_kernel #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp" // Define commonly used types. @@ -40,8 +44,11 @@ using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +#endif + namespace fbgemm_gpu { +#if (defined(USE_ROCM) && ROCM_VERSION >= 60300) template < int BLOCK_SIZE, int MBLOCK, @@ -269,4 +276,20 @@ at::Tensor f8f8bf16_blockwise( } } +#else + +at::Tensor f8f8bf16_blockwise( + at::Tensor XQ, + at::Tensor WQ, + at::Tensor x_scale, + at::Tensor w_scale, + int64_t block_m = 128, + int64_t block_n = 128, + int64_t block_k = 128) { + throw std::runtime_error( + "ROCm version is older than 6.3"); // requires ROCm>=6.3 +} + +#endif + } // namespace fbgemm_gpu