style fixes

FMarno · FMarno · commit bf21e0f1eff6 · 2025-04-15T10:40:55.000+01:00
diff --git a/benchmarks/pvc/gemm_configuration.hpp b/benchmarks/pvc/gemm_configuration.hpp
@@ -80,7 +80,8 @@ struct GemmConfiguration<
       float, LayoutC,
       float, TileShape,
       TileScheduler> {
-  using KernelScheduleType =std::conditional_t<TileScheduler == Scheduler::Gemm, cutlass::gemm::KernelPVC, cutlass::gemm::KernelPVCCooperative>;
+  using KernelScheduleType = std::conditional_t<TileScheduler == Scheduler::Gemm,
+    cutlass::gemm::KernelPVC, cutlass::gemm::KernelPVCCooperative>;
 
 
   static_assert(std::is_same_v<LayoutC, cutlass::layout::RowMajor>, "Column Major LayoutC unsupported in collective builder");
diff --git a/include/cutlass/gemm/collective/builders/xe_mma_builder.inl b/include/cutlass/gemm/collective/builders/xe_mma_builder.inl
@@ -39,6 +39,34 @@
 namespace cutlass::gemm::collective {
 
 namespace {
+// TODO(codeplay): generic selection methods are overcomplicated
+
+// Generic way to pick the number of subgroups along the M dim (RowMajor A: 8, 4, 2 or 1; otherwise tile_M / 16)
+// If tuning for a specific case, create a SubgroupTilingMap specialization
+template <typename LayoutA, class TileShape_MNK>
+constexpr inline int calculate_sgs_in_M() {
+  constexpr int tile_M = get<0>(TileShape_MNK{});  // first (M) mode of the tile shape
+  if constexpr (cute::is_same_v<LayoutA, cutlass::layout::RowMajor>) {
+    // Non-transpose load can be size 1, 2, 4, 8, 16, or 32 in the M dim (for bf16),
+    // but we are only supporting 8, 16 and 32 so far.
+    for (auto atom_m : {32,16,8}) {  // candidate load sizes in M, largest first
+      auto atoms_in_m = tile_M / atom_m;  // whole loads of this size that fit in tile_M
+      for (auto atoms : {8,4,2}) {  // candidate subgroup counts, largest first
+        if (atoms_in_m >= atoms) {
+          return atoms;  // net effect: 8 if tile_M >= 256, 4 if >= 128, 2 if >= 16
+        }
+      }
+    }
+    return 1;  // reached only when tile_M < 16, i.e. fewer than two 8-row loads fit
+  } else {
+    // Transpose loads are always size 16 in the M dim (for bf16).
+    static_assert(tile_M / 16 > 0 and tile_M % 16 == 0, "Invalid Tile size in M dim");  // tile_M must be a positive multiple of 16
+    return tile_M / 16;  // one subgroup per 16-row transpose load
+  }
+}
+
+// Generic way to pick a copy atom for A
+// If tuning for a specific case, create a SubgroupTilingMap specialization
 template <typename LayoutA, class TileShape_MNK, int sgs_M>
 inline auto pick_load_atom_for_A() {
   if constexpr (cute::is_same_v<LayoutA, cutlass::layout::RowMajor>) {
@@ -60,6 +88,8 @@ inline auto pick_load_atom_for_A() {
   }
 }
 
+// Generic way to pick a copy atom for B
+// If tuning for a specific case, create a SubgroupTilingMap specialization
 template <typename LayoutB, class TileShape_MNK, int sgs_N>
 inline auto pick_load_atom_for_B() {
   if constexpr (cute::is_same_v<LayoutB, cutlass::layout::RowMajor>) {
@@ -76,28 +106,6 @@ inline auto pick_load_atom_for_B() {
   }
 }
 
-template <typename LayoutA, class TileShape_MNK>
-constexpr inline int calculate_sgs_in_M() {
-  constexpr int tile_M = get<0>(TileShape_MNK{});
-  if constexpr (cute::is_same_v<LayoutA, cutlass::layout::RowMajor>) {
-    // Non-transpose load can be size 1, 2, 4, 8, 16, or 32 in the M dim (for bf16),
-    // but we are only supporting 8, 16 and 32 so far.
-    for (auto atom_m : {32,16,8}) {
-      auto atoms_in_m = tile_M / atom_m;
-      for (auto atoms : {8,4,2}) {
-        if (atoms_in_m >= atoms) {
-          return atoms;
-        }
-      }
-    }
-    return 1;
-  } else {
-    // Transpose loads are always size 16 in the M dim (for bf16).
-    static_assert(tile_M / 16 > 0 and tile_M % 16 == 0, "Invalid Tile size in M dim");
-    return tile_M / 16;
-  }
-}
-
 // Lookup table for subgroup layout
 // This is the default case
 template <typename TileShape, typename LayoutA, typename LayoutB>
@@ -115,7 +123,6 @@ struct SubgroupTilingMap {
       using sgs_N = Int<std::min(tile_N/atom_N, sgs_total/sgs_M::value)>;
       using GmemTiledCopyA = decltype(pick_load_atom_for_A<LayoutA, TileShape, sgs_M{}>());
       using GmemTiledCopyB = decltype(pick_load_atom_for_B<LayoutB, TileShape, sgs_N{}>());
-
 };
 
 template <>
@@ -222,16 +229,7 @@ struct CollectiveBuilder<
                                                   XE_8x16x16_F32BF16BF16F32_TT,
                                                   XE_8x16x16_F32F16F16F32_TT>>;
 
-      // We have too many subgroups, we can have at most 32, but only 8 are needed for 8x128 values (8x16 mma)
       // Prepare Template arguments required of CollectiveMainLoop
-      static constexpr auto tile_M = get<0>(TileShape_MNK{});
-      static constexpr auto tile_N = get<1>(TileShape_MNK{});
-      static constexpr auto tile_K = get<2>(TileShape_MNK{});
-
-      // number of subgroups in a dim is at most (values in a dim)/(atom size in a dim)
-      using atom_mnk = typename MMAAtom::Shape_MNK;
-      using max_subgroups = decltype(take<0,2>(shape_div(TileShape_MNK{}, atom_mnk{}))); // M, N
-
       using SgTilingMap = SubgroupTilingMap<TileShape_MNK, GmemLayoutATag, GmemLayoutBTag>;
       using sgs_M = typename SgTilingMap::sgs_M;
       using sgs_N = typename SgTilingMap::sgs_N;
@@ -262,7 +260,6 @@ struct CollectiveBuilder<
       using StrideA = cutlass::gemm::TagToStrideA_t<std::conditional_t<IsGroup, GmemLayoutATag*, GmemLayoutATag>>;
       using StrideB = cutlass::gemm::TagToStrideB_t<std::conditional_t<IsGroup, GmemLayoutBTag*, GmemLayoutBTag>>;
 
-
       using CollectiveOp = cutlass::gemm::collective::CollectiveMma<
               DispatchPolicy,
               TileShape_MNK,