@@ -36,6 +36,7 @@
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
 #include "cutlass/gemm/collective/collective_mma.hpp"
 #include "cutlass/util/GPU_Clock.hpp"
+#include "cutlass/epilogue/fusion/operations.hpp"
 
 #include "cutlass/util/host_tensor.h"
 #include "cutlass/util/reference/host/tensor_fill.h"
@@ -47,6 +48,7 @@
 #include "cutlass/util/reference/device/gemm_complex.h"
 #include "cutlass/util/reference/device/tensor_compare.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/tensor_silu.h"
 
 #include <benchmark/benchmark.h>
 
@@ -169,6 +171,37 @@ struct BenchmarkRunnerGemm {
 
   using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
 
+  using FusionOp = typename Gemm::EpilogueOutputOp;
+
+  // TODO(codeplay): Epilogue detection here should be replaced w/ general solution (see other TODO)
+  using FusionSilu = cutlass::epilogue::fusion::LinCombEltAct<
+      cutlass::epilogue::thread::SiLu, ElementOutput, ElementCompute, ElementAccumulator,
+      ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using FusionDeEltMul = cutlass::epilogue::fusion::LinCombDeEltAct<LayoutC, std::multiplies,
+      ElementOutput, ElementCompute>;
+  using FusionLinComb = epilogue::fusion::LinearCombination<
+      ElementOutput, ElementCompute, ElementAccumulator, ElementAccumulator,
+      FloatRoundStyle::round_to_nearest>;
+
+  // Epilogue used in ampere/gemm_configuration.hpp
+  using DefaultEpilogue = epilogue::collective::DefaultEpilogue<
+      float,
+      cutlass::gemm::TagToStrideC_t<LayoutC>,
+      cutlass::gemm::TagToStrideC_t<LayoutC>,
+      epilogue::thread::LinearCombination<float, 1>,
+      cutlass::gemm::EpilogueDefault>;
+
+  static constexpr bool epi_is_deeltactmul = std::is_same_v<FusionOp, FusionDeEltMul>;
+  static constexpr bool epi_is_silu = std::is_same_v<FusionOp, FusionSilu>;
+  static constexpr bool epi_is_lincomb = std::is_same_v<FusionOp, FusionLinComb>;
+  static constexpr bool epi_is_default = std::is_same_v<CollectiveEpilogue, DefaultEpilogue>;
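+  // Note: DefaultEpilogue is not a fusion::FusionOperation, so it is detected via the
+  // CollectiveEpilogue type itself rather than via FusionOp.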
+  static_assert(cute::is_base_of_v<cutlass::epilogue::fusion::FusionOperation, FusionOp> ||
+                    epi_is_default,
+                "Failed to determine benchmark epilogue");
+  static_assert(epi_is_default || epi_is_deeltactmul || epi_is_silu || epi_is_lincomb,
+                "Failed to determine benchmark epilogue");
+
   int32_t count;
 
   //
@@ -188,6 +221,7 @@ struct BenchmarkRunnerGemm {
   std::vector<DeviceAllocation<ElementC>> block_C;
   DeviceAllocation<ElementOutput> block_D;
   DeviceAllocation<ElementOutput> block_ref_D;
+  std::vector<DeviceAllocation<ElementOutput>> block_Aux;
 
   BenchmarkRunnerGemm() : seed(0) {};
 
@@ -227,6 +261,20 @@ struct BenchmarkRunnerGemm {
     cudaDeviceSynchronize();
 #endif
 
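+    // The device reference GEMM computed only the plain linear combination, so apply
+    // the fused epilogue's elementwise step to the reference output before comparing.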
+    // TODO(codeplay): Replace this with a general solution (hook up to Testbed3x)
+    if constexpr (epi_is_silu) {
+      using TensorView = cutlass::TensorView<ElementOutput, LayoutD>;
+      for (int batch = 0, offset = 0; batch < L; batch++, offset += M * N) {
+        cutlass::reference::device::TensorSiLu(TensorView(
+            block_ref_D.get() + offset, LayoutD::packed({M, N}), cutlass::make_Coord(M, N)));
+      }
+    } else if constexpr (epi_is_deeltactmul) {
+      cutlass::reference::device::BlockElementwiseOp<std::multiplies>(
+          block_ref_D.get(), block_ref_D.get(), block_Aux[0].get(), block_D.size());
+    }
+
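+    // Wait for the reference fix-up kernels to finish before the device-side compare.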
+    syclcompat::wait();
+
     // Check if output from CUTLASS kernel and reference kernel are equal or not
     bool passed = reference::device::BlockCompareEqual(
         block_ref_D.get(), block_D.get(), block_D.size());
@@ -256,6 +304,9 @@ struct BenchmarkRunnerGemm {
       block_A.emplace_back();
       block_B.emplace_back();
       block_C.emplace_back();
+      if constexpr (epi_is_deeltactmul) {
+        block_Aux.emplace_back();
+      }
     }
 
     for (int i=0; i < count; i++) {
@@ -265,6 +316,10 @@ struct BenchmarkRunnerGemm {
       initialize_block(block_A[i], seed + i);
       initialize_block(block_B[i], seed + i);
       initialize_block(block_C[i], seed + i);
+      if constexpr (epi_is_deeltactmul) {
+        block_Aux[i].reset(size_C);
+        initialize_block(block_Aux[i], seed + i);
+      }
     }
 
     block_D.reset(size_C);
@@ -284,6 +339,11 @@ struct BenchmarkRunnerGemm {
     arguments.epilogue = {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D};
     arguments.hw_info = hw_info;
 
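+    // This epilogue also reads an auxiliary input tensor: pass its pointer and a packed (m,n,l) stride.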
+    if constexpr (epi_is_deeltactmul) {
+      arguments.epilogue.thread.aux_ptr = block_Aux[0].get();
+      arguments.epilogue.thread.dAux = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
+    }
+
     Gemm gemm_op;
 
     size_t workspace_size = Gemm::get_workspace_size(arguments);
@@ -352,6 +412,10 @@ struct BenchmarkRunnerGemm {
         {{options.alpha, options.beta}, block_C[input_num].get(), stride_C, block_D.get(), stride_D},
         hw_info
       };
+      if constexpr (epi_is_deeltactmul) {
+        arguments.epilogue.thread.aux_ptr = block_Aux[input_num].get();
+        arguments.epilogue.thread.dAux = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));
+      }
       gemm_op.initialize(arguments, workspace.get());
       state.ResumeTiming();
@@ -370,18 +434,20 @@ struct BenchmarkRunnerGemm {
   static void initialize_counters(::benchmark::State& state) {
     state.counters["avg_runtime_ms"] = 0;
     state.counters["best_runtime_ms"] = std::numeric_limits<double>::max();
+    state.counters["worst_runtime_ms"] = -std::numeric_limits<double>::max();
   }
 
   static void update_counters(::benchmark::State& state, double ms_elapsed) {
     state.PauseTiming();
     state.counters["total_runtime_ms"] += ms_elapsed;
     state.counters["best_runtime_ms"] = std::min<double>(state.counters["best_runtime_ms"], ms_elapsed);
+    state.counters["worst_runtime_ms"] = std::max<double>(state.counters["worst_runtime_ms"], ms_elapsed);
     state.ResumeTiming();
   }
 
   static void finalize_counters(::benchmark::State& state, double gflop, double mega_bytes_transferred) {
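+    // Trimmed mean: drop the single best and worst samples, then average the remaining iterations.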
state.counters [" avg_runtime_ms" ] =
384
- state.counters [" total_runtime_ms" ] / static_cast <double >(state.iterations ());
450
+ ( state.counters [" total_runtime_ms" ] -state. counters [ " best_runtime_ms " ] - state. counters [ " worst_runtime_ms " ] ) / static_cast <double >(state.iterations () - 2 );
385
451
state.counters [" avg_tflops" ] = gflop / state.counters [" avg_runtime_ms" ];
386
452
state.counters [" avg_throughput" ] = mega_bytes_transferred / state.counters [" avg_runtime_ms" ];
387
453
state.counters [" best_tflop" ] = gflop / state.counters [" best_runtime_ms" ];