wip: re-enable gen_ai for ROCm; gate CK blockwise FP8 GEMM on ROCm >= 6.3

q10 · q10 · commit fa24249a614a · 2024-11-25T15:13:56.000-08:00
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -195,7 +195,7 @@ if(NOT FBGEMM_CPU_ONLY)
   add_subdirectory(experimental/gemm)
 endif()
 
-if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
+if(NOT FBGEMM_CPU_ONLY)
   # TODO: Re-enable gen_ai for ROCm once ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp
   # lands into latest ROCm
   add_subdirectory(experimental/gen_ai)
diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip
@@ -30,7 +30,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 
+#if (defined(USE_ROCM) && ROCM_VERSION >= 60300)
+// NOTE: This source is currently only available in the `develop` branch of CK
+// https://github.com/ROCm/composable_kernel
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp"
+#endif
 
 // Define commonly used types.
 template <ck::index_t... Is>
@@ -42,6 +46,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 namespace fbgemm_gpu {
 
+#if (defined(USE_ROCM) && ROCM_VERSION >= 60300)
 template <
     int BLOCK_SIZE,
     int MBLOCK,
@@ -269,4 +274,20 @@ at::Tensor f8f8bf16_blockwise(
   }
 }
 
+#else
+
+at::Tensor f8f8bf16_blockwise(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    int64_t block_m = 128,
+    int64_t block_n = 128,
+    int64_t block_k = 128) {
+  throw std::runtime_error(
+      "ROCm version is older than 6.3"); // stub; kernel needs ROCm >= 6.3
+}
+
+#endif
+
 } // namespace fbgemm_gpu