pytorch · q10 · Nov 25, 2024
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -195,7 +195,7 @@ if(NOT FBGEMM_CPU_ONLY)
   add_subdirectory(experimental/gemm)
 endif()
 
-if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
+if(NOT FBGEMM_CPU_ONLY)
   # TODO: Re-enable gen_ai for ROCm once ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
   # lands into latest ROCm
   add_subdirectory(experimental/gen_ai)

diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip
@@ -23,8 +23,40 @@
 #include "ck/ck.hpp"
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
+
+#if __has_include("ck/utility/flush_icache.hpp")
+
 #include "ck/utility/flush_icache.hpp"
 
+#else
+
+#include <hip/hip_runtime.h>
+
+namespace ck {
+// Copied from: https://github.com/ROCm/composable_kernel/blob/rocm-6.2.0/include/ck/utility/flush_icache.hpp
+static __global__ void flush_icache() {
+    asm __volatile__("s_icache_inv \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t"
+                     "s_nop 0 \n\t" ::
+                         :);
+}
+} // namespace ck
+
+#endif
 namespace fbgemm_gpu {
 
 void flush_icache_ck()

diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip
@@ -15,6 +15,8 @@
 #include <c10/hip/HIPStream.h>
 #include <torch/torch.h>
 
+#if (defined(USE_ROCM) && ROCM_VERSION >= 60300)
+
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -30,6 +32,8 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 
+// NOTE: This source is currently only available in the `develop` branch of CK
+// https://github.com/ROCm/composable_kernel
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
 
 // Define commonly used types.
@@ -40,8 +44,11 @@ using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
+#endif
+
 namespace fbgemm_gpu {
 
+#if (defined(USE_ROCM) && ROCM_VERSION >= 60300)
 template <
     int BLOCK_SIZE,
     int MBLOCK,
@@ -269,4 +276,20 @@ at::Tensor f8f8bf16_blockwise(
   }
 }
 
+#else
+
+at::Tensor f8f8bf16_blockwise(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    int64_t block_m = 128,
+    int64_t block_n = 128,
+    int64_t block_k = 128) {
+  throw std::runtime_error(
+      "ROCm version is older than 6.3"); // requires ROCm>=6.3
+}
+
+#endif
+
 } // namespace fbgemm_gpu