correct flop calculation for causal mask when qk_seq_len!=kv_seq_len (#332)

mehdi-goli · muhammad-tanvir-1211 · aacostadiaz · web-flow · commit 167f160559bb · 2025-04-29T06:42:26.000+01:00
This PR fixes the reported number of floating-point operations and
read/write bytes when causal masking is applied and `qk_seq_len` is not
equal to `kv_seq_len`.

---------

Co-authored-by: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Co-authored-by: Alejandro Acosta <alejandro.acosta@codeplay.com>
diff --git a/benchmarks/pvc/flash_attention_v2/benchmark_runner.hpp b/benchmarks/pvc/flash_attention_v2/benchmark_runner.hpp
@@ -535,13 +535,19 @@ template <class FMHAConfiguration> struct BenchmarkRunnerFMHA {
     extra_label << "layoutV=RowMajor ";
 
     state.SetLabel(extra_label.str());
-
-    double flops_qk = 2.0 * options.batch * options.num_heads_q * options.seq_len_qo * options.seq_len_kv * options.head_size_qk;
-    double flops_pv = 2.0 * options.batch * options.num_heads_q * options.seq_len_qo * options.head_size_vo * options.seq_len_kv;
+    // When seq_len_qo is not equal to seq_len_kv, a bottom-up approach is used for the masking.
+    // The following adjusts effective_seq_len_kv and effective_seq_len_qo when masking is applied in such cases.
+    auto offset = cute::min(options.seq_len_qo, options.seq_len_kv);
+    auto discard_seq_coord = options.seq_len_qo - offset;
+    auto full_tile_offset = options.seq_len_kv - offset;
+    auto effective_seq_len_kv = Causal ? full_tile_offset + (options.seq_len_kv / 2.0): options.seq_len_kv;
+    auto effective_seq_len_qo = Causal ? options.seq_len_qo - discard_seq_coord : options.seq_len_qo;
+   
+    double flops_qk = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * effective_seq_len_kv * options.head_size_qk;
+    double flops_pv = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * options.head_size_vo * effective_seq_len_kv;
     double gflops = (flops_qk + flops_pv) * 1e-9;
-
-    double gbps_qk = 2.0 * options.batch * options.num_heads_q * (options.seq_len_qo * options.head_size_qk + options.seq_len_kv * options.head_size_qk);
-    double gbps_pv = 2.0 * options.batch * options.num_heads_q * (options.seq_len_kv * options.seq_len_qo + options.seq_len_qo * options.head_size_vo);
+    double gbps_qk = 2.0 * options.batch * options.num_heads_q * (effective_seq_len_qo * options.head_size_qk + effective_seq_len_kv * options.head_size_qk);
+    double gbps_pv = 2.0 * options.batch * options.num_heads_q * (effective_seq_len_kv * effective_seq_len_qo + effective_seq_len_qo * options.head_size_vo);
     double mega_bytes_transferred = (gbps_qk + gbps_pv) * (1e-6);
 
     initialize_counters(state);
diff --git a/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp b/examples/sycl/06_pvc_flash_attention/pvc_flash_attn_runner.hpp
@@ -530,17 +530,19 @@ template <class GemmKernel, bool isVarLen> struct ExampleRunner {
         run(params);
       }
       syclcompat::wait();
-
-      double effective_seq_len_kv = options.is_causal ?
-        options.seq_len_kv / 2.0 :
-        options.seq_len_kv;
-      
+      // When seq_len_qo is not equal to seq_len_kv, a bottom-up approach is used for the masking.
+      // The following adjusts effective_seq_len_kv and effective_seq_len_qo when masking is applied in such cases.
+      auto offset = cute::min(options.seq_len_qo, options.seq_len_kv);
+      auto discard_seq_coord = options.seq_len_qo - offset;
+      auto full_tile_offset = options.seq_len_kv - offset;
+      auto effective_seq_len_kv = options.is_causal ? full_tile_offset + (options.seq_len_kv / 2.0): options.seq_len_kv;
+      auto effective_seq_len_qo = options.is_causal ? options.seq_len_qo - discard_seq_coord : options.seq_len_qo;
       double cute_time = timer.seconds() / options.iterations;
-      double flops_qk = 2.0 * options.batch * options.num_heads_q * options.seq_len_qo * effective_seq_len_kv * options.head_size_qk;
-      double flops_pv = 2.0 * options.batch * options.num_heads_q * options.seq_len_qo * options.head_size_vo * effective_seq_len_kv;
+      double flops_qk = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * effective_seq_len_kv * options.head_size_qk;
+      double flops_pv = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * options.head_size_vo * effective_seq_len_kv;
       double tflops = ((flops_qk + flops_pv) * 1e-12) / cute_time;
-      double gbps_qk = 2.0 * options.batch * options.num_heads_q * (options.seq_len_qo * options.head_size_qk + effective_seq_len_kv * options.head_size_qk);
-      double gbps_pv = 2.0 * options.batch * options.num_heads_q * (effective_seq_len_kv * options.seq_len_qo + options.seq_len_qo * options.head_size_vo);
+      double gbps_qk = 2.0 * options.batch * options.num_heads_q * (effective_seq_len_qo * options.head_size_qk + effective_seq_len_kv * options.head_size_qk);
+      double gbps_pv = 2.0 * options.batch * options.num_heads_q * (effective_seq_len_kv * effective_seq_len_qo + effective_seq_len_qo * options.head_size_vo);
       double gbps = ((gbps_qk + gbps_pv)  * 1e-9) / (cute_time);
       std::cout << "Batch: " << options.batch << "\tNumHeads_q: " << options.num_heads_q  << "\tNumHeads_kv: " << options.num_heads_kv  << "\tSeq Length QO: " << options.seq_len_qo
                 << "\tSeq Length KV: " << options.seq_len_kv << "\tHead Size QK: " << options.head_size_qk << "\tHead Size VO: " << options.head_size_vo