Skip to content

Commit 9f0b460

Browse files
mehdi-goli, t4c1, aacostadiaz
authored
Fix the performance regression for flash attention for 2025.1 release compiler (#327)
This PR is a workaround for a performance regression in the 2025.1 compiler. The commit that fixes the underlying issue (intel/llvm@71ca51f) did not meet the DPC++ 2025.1 cut-off date; it will be included in the 2025.2 release. --------- Co-authored-by: Tadej Ciglarič <[email protected]> Co-authored-by: Alejandro Acosta <[email protected]>
1 parent 6ee9439 commit 9f0b460

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

applications/flash_attention_v2/collective/xe_flash_attn_mma.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@ using namespace cute;
4747

4848
template <typename To_type, typename Engine, typename Layout>
4949
CUTLASS_DEVICE auto convert_type(Tensor<Engine, Layout> const &tensor) {
50-
using From_type = typename Engine::value_type;
51-
constexpr int numel = decltype(size(tensor))::value;
52-
cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
53-
auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
54-
return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
50+
using From_type = typename Engine::value_type;
51+
constexpr int numel = decltype(size(tensor))::value;
52+
cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
53+
auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
54+
return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
5555
}
5656

5757
////////////////////////////////////////////////////////////////////////////////////////////////////

include/cutlass/numeric_conversion.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,15 @@ struct NumericConverter<cutlass::bfloat16_t, float, FloatRoundStyle::round_to_ne
539539

540540
// Scalar float -> bfloat16 conversion for the round-to-nearest specialization.
CUTLASS_HOST_DEVICE
static result_type convert(source_type const & s) {
#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200) && defined(__SYCL_DEVICE_ONLY__)
    // Temporary patch to avoid linking in the devicelib fallback unconditionally.
    // Workaround for a performance regression in the 2025.1 Intel compiler:
    // on SYCL device code we lower directly to the SPIR-V bf16 conversion
    // intrinsic and write the raw bits into `storage`, instead of going
    // through static_cast. The upstream fix (intel/llvm@71ca51f) lands in
    // the 2025.2 release, hence the `< 20250200` version guard — this
    // branch can be dropped once 2025.2 is the minimum supported compiler.
    result_type res;
    res.storage=(__spirv_ConvertFToBF16INTEL(s));
    return res;
#else
    // Host code and fixed compilers: plain narrowing conversion.
    return static_cast<cutlass::bfloat16_t>(s);
#endif
}
544552

545553
CUTLASS_HOST_DEVICE

0 commit comments

Comments
 (0)