From 93eb7c803851c74eb92f58b67b8973e722e85b94 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com>
Date: Mon, 10 Feb 2025 20:32:20 -0800
Subject: [PATCH] [LLVMGPU] Add initial kernel config for horizontally fused
 gemms. (#19923)

This is in preparation for the modified way of generating horizontally
fused GEMMs. This PR adds kernel configuration for these GEMM ops so
that they can go down the vector distribute pipeline.

---------

Signed-off-by: MaheshRavishankar
---
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp |  18 +-
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel |   1 +
 .../Codegen/LLVMGPU/test/CMakeLists.txt       |   1 +
 .../test/config_horizontally_fused_ops.mlir   | 283 ++++++++++++++++++
 .../Dialect/LinalgExt/Utils/BUILD.bazel       |   1 +
 .../Dialect/LinalgExt/Utils/CMakeLists.txt    |   1 +
 .../Dialect/LinalgExt/Utils/Utils.cpp         | 191 ++++++++++++
 .../compiler/Dialect/LinalgExt/Utils/Utils.h  |   5 +
 8 files changed, 499 insertions(+), 2 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_horizontally_fused_ops.mlir

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 94bb2246bf52..ac0ecf86158a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -490,6 +490,15 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> bounds = op.getStaticLoopRanges();
   FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
       mlir::linalg::inferContractionDims(op);
+  if (failed(contractionDims)) {
+    assert(IREE::LinalgExt::isaHorizontallyFusedContraction(op) &&
+           "expected horizontally fused contraction op");
+    SmallVector<AffineMap> indexingMaps;
+    indexingMaps.push_back(op.getMatchingIndexingMap(op.getDpsInputOperand(0)));
+    indexingMaps.push_back(op.getMatchingIndexingMap(op.getDpsInputOperand(1)));
+    indexingMaps.push_back(op.getMatchingIndexingMap(op.getDpsInitOperand(0)));
+    contractionDims = mlir::linalg::inferContractionDims(indexingMaps);
+  }
   assert(succeeded(contractionDims) && "Could not infer contraction dims");

   if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
@@ -602,6 +611,8 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
         /*bestMNTileCountPerSubgroup=*/8,
         /*bestKTileCountPerSubgroup=*/4};
   }
+  // Scale the seed by the number of contractions in the horizontally fused
+  // case.
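+  // For a plain contraction (two DPS inputs) the divisor below is 1 and the
+  // seeds are unchanged. For a horizontally fused op with one LHS shared by
+  // three RHS operands (four DPS inputs, three results) the divisor is 3,
+  // so a seed of bestMNTileCountPerSubgroup = 8 becomes 8 / 3 = 2 under
+  // integer division, keeping the combined tile footprint of all fused
+  // GEMMs comparable to that of a single GEMM.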
+ seeds.bestMNTileCountPerSubgroup /= op.getNumDpsInputs() - 1; int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); @@ -699,7 +710,9 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, SmallVector attrs = { NamedAttribute("workgroup", b.getI64ArrayAttr(workgroupTileSizes)), NamedAttribute("reduction", b.getI64ArrayAttr(reductionTileSizes))}; - IREE::GPU::setPromotedOperandList(context, attrs, {0, 1}); + auto promotedOperands = + llvm::to_vector(llvm::seq(op.getNumDpsInputs())); + IREE::GPU::setPromotedOperandList(context, attrs, promotedOperands); IREE::GPU::setMmaKind(context, attrs, mmaKinds[schedule->index]); IREE::GPU::setSubgroupMCount(context, attrs, schedule->mSubgroupCounts[0]); IREE::GPU::setSubgroupNCount(context, attrs, schedule->nSubgroupCounts[0]); @@ -1204,7 +1217,8 @@ setVectorDistributionConfig(IREE::GPU::TargetAttr target, LDBG("VectorDistribution: finding a suitable config..."); if (auto linalgOp = dyn_cast(computeOp)) { - if (linalg::isaContractionOpInterface(linalgOp)) { + if (linalg::isaContractionOpInterface(linalgOp) || + IREE::LinalgExt::isaHorizontallyFusedContraction(linalgOp)) { LDBG("VectorDistribution: trying to find a suitable contraction config"); return setMatmulVectorDistributionConfig(target, entryPoint, linalgOp); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 2183e35a75af..909c4ce15c18 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -29,6 +29,7 @@ iree_lit_test_suite( "cast_address_space_function.mlir", "cast_type_to_fit_mma.mlir", "config_custom_op.mlir", + "config_horizontally_fused_ops.mlir", "config_matvec.mlir", "config_root_op_attribute.mlir", "config_winograd.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 53b06322befb..284c966535b5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -18,6 +18,7 @@ iree_lit_test_suite( "cast_address_space_function.mlir" "cast_type_to_fit_mma.mlir" "config_custom_op.mlir" + "config_horizontally_fused_ops.mlir" "config_matvec.mlir" "config_root_op_attribute.mlir" "config_winograd.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_horizontally_fused_ops.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_horizontally_fused_ops.mlir new file mode 100644 index 000000000000..87a3d9c18b41 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_horizontally_fused_ops.mlir @@ -0,0 +1,283 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' --mlir-print-local-scope %s | FileCheck %s + +func.func @fused_contraction_1(%arg0: tensor<2x4096x640xf16>, + %arg1 : tensor<10x64x640xf16>, %arg2 : tensor<10x64x640xf16>, + %arg3 : tensor<10x64x640xf16>) + -> (tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>) { + %11 = tensor.empty() : tensor<2x10x4096x64xf16> + %12 = tensor.empty() : tensor<2x10x4096x64xf32> + %cst = arith.constant 0.0: f32 + %13 = linalg.fill ins(%cst : f32) + outs(%12 : tensor<2x10x4096x64xf32>) -> tensor<2x10x4096x64xf32> + %14:3 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>, + 
affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} + ins(%arg0, %arg1, %arg2, %arg3 + : tensor<2x4096x640xf16>, tensor<10x64x640xf16>, tensor<10x64x640xf16>, + tensor<10x64x640xf16>) + outs(%13, %13, %13 + : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>) { + ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %out: f32, %out_3: f32, %out_4: f32): + %18 = arith.extf %in : f16 to f32 + %19 = arith.extf %in_0 : f16 to f32 + %20 = arith.mulf %18, %19 : f32 + %21 = arith.addf %out, %20 : f32 + %22 = arith.extf %in_1 : f16 to f32 + %23 = arith.mulf %18, %22 : f32 + %24 = arith.addf %out_3, %23 : f32 + %25 = arith.extf %in_2 : f16 to f32 + %26 = arith.mulf %18, %25 : f32 + %27 = arith.addf %out_4, %26 : f32 + linalg.yield %21, %24, %27 : f32, f32, f32 + } -> (tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>) + %15 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#0 : tensor<2x10x4096x64xf32>) outs(%11 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : f16 + } -> tensor<2x10x4096x64xf16> + %16 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#1 : tensor<2x10x4096x64xf32>) outs(%11 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : f16 + } -> tensor<2x10x4096x64xf16> + %17 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#2 : tensor<2x10x4096x64xf32>) outs(%11 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : f16 + } -> tensor<2x10x4096x64xf16> + return %15, %16, %17 + : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16> +} +// CHECK-LABEL: func @fused_contraction_1 +// CHECK-SAME: translation_info = #iree_codegen.translation_info +// CHECK-SAME: pipeline = LLVMGPUVectorDistribute +// CHECK-SAME: workgroup_size = [256, 1, 1] +// CHECK-SAME: subgroup_size = 64 +// CHECK: %[[GENERIC:.+]]:3 = linalg.generic +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout, + %arg1 : tensor<640x640xf32>, %arg2 : tensor<640x640xf32>, + %arg3 : tensor<640x640xf32>) + -> (tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>) { + %11 = tensor.empty() : tensor<4096x640xf32> + %12 = tensor.empty() : tensor<4096x640xf32> + %cst = arith.constant 0.0: f32 + %13 = linalg.fill ins(%cst : f32) + outs(%12 : tensor<4096x640xf32>) -> tensor<4096x640xf32> + %14:3 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, 
d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%arg0, %arg1, %arg2, %arg3 + : tensor<4096x640xf32>, tensor<640x640xf32>, tensor<640x640xf32>, + tensor<640x640xf32>) + outs(%13, %13, %13 + : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>) { + ^bb0(%in: f32, %in_0: f32, %in_1: f32, %in_2: f32, %out: f32, %out_3: f32, %out_4: f32): + %20 = arith.mulf %in, %in_0 : f32 + %21 = arith.addf %out, %20 : f32 + %23 = arith.mulf %in, %in_1 : f32 + %24 = arith.addf %out_3, %23 : f32 + %26 = arith.mulf %in, %in_2 : f32 + %27 = arith.addf %out_4, %26 : f32 + linalg.yield %21, %24, %27 : f32, f32, f32 + } -> (tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32>) + return %14#0, %14#1, %14#2 + : tensor<4096x640xf32>, tensor<4096x640xf32>, tensor<4096x640xf32> +} +// CHECK-LABEL: func @fused_contraction_2 +// CHECK-SAME: translation_info = #iree_codegen.translation_info +// CHECK-SAME: pipeline = LLVMGPUVectorDistribute +// CHECK-SAME: workgroup_size = [256, 1, 1] +// CHECK-SAME: subgroup_size = 64 +// CHECK: %[[GENERIC:.+]]:3 = linalg.generic +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout, + %arg1 : tensor<2x640x640xi8>, %arg2 : tensor<2x640x640xi8>) + -> (tensor<2x4096x640xf16>, tensor<2x4096x640xf16>) { + %c0_i32 = arith.constant 0 : i32 + %18 = tensor.empty() : tensor<2x4096x640xf16> + %19 = tensor.empty() : tensor<2x4096x640xi32> + %20 = linalg.fill ins(%c0_i32 : i32) + outs(%19 : tensor<2x4096x640xi32>) -> tensor<2x4096x640xi32> + %21:2 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel", "reduction"]} + ins(%arg0, %arg1, %arg2 : tensor<2x4096x640xi8>, tensor<2x640x640xi8>, tensor<2x640x640xi8>) + outs(%20, %20 : tensor<2x4096x640xi32>, tensor<2x4096x640xi32>) { + ^bb0(%in: i8, %in_0: i8, %in_1: i8, %out: i32, %out_2: i32): + %24 = arith.extsi %in : i8 to i32 + %25 = arith.extsi %in_0 : i8 to i32 + %26 = arith.muli %24, %25 : i32 + %27 = arith.addi %out, %26 : i32 + %28 = arith.extsi %in_1 : i8 to i32 + %29 = arith.muli %24, %28 : i32 + %30 = arith.addi %out_2, %29 : i32 + linalg.yield %27, %30 : i32, i32 + } -> (tensor<2x4096x640xi32>, tensor<2x4096x640xi32>) + %22 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%21#0 : tensor<2x4096x640xi32>) outs(%18 : tensor<2x4096x640xf16>) { + ^bb0(%in: i32, %out: f16): + %27 = arith.sitofp %in : i32 to f32 + %29 = arith.truncf %27 : f32 to f16 + linalg.yield %29 : f16 + } -> tensor<2x4096x640xf16> + %23 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%21#1 : tensor<2x4096x640xi32>) outs(%18 : tensor<2x4096x640xf16>) { + ^bb0(%in: i32, %out: f16): + %27 = arith.sitofp %in : i32 to f32 + %29 = arith.truncf %27 : f32 to f16 + linalg.yield %29 : f16 + } -> tensor<2x4096x640xf16> + return %22, %23 : tensor<2x4096x640xf16>, tensor<2x4096x640xf16> +} +// CHECK-LABEL: func 
@fused_contraction_3 +// CHECK-SAME: translation_info = #iree_codegen.translation_info +// CHECK-SAME: pipeline = LLVMGPUVectorDistribute +// CHECK-SAME: workgroup_size = [256, 1, 1] +// CHECK-SAME: subgroup_size = 64 +// CHECK: %[[GENERIC:.+]]:2 = linalg.generic +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout, + %arg1 : tensor<10x64x640xf16>, %arg2 : tensor<10x64x640xf16>, + %arg3 : tensor<10x64x640xf16>) + -> (tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>) { + %9 = tensor.empty() : tensor<2x10x64x4096xf16> + %10 = tensor.empty() : tensor<2x10x64x4096xf32> + %11 = tensor.empty() : tensor<2x10x4096x64xf16> + %12 = tensor.empty() : tensor<2x10x4096x64xf32> + %cst = arith.constant 0.0: f32 + %fill0 = linalg.fill ins(%cst : f32) + outs(%12 : tensor<2x10x4096x64xf32>) -> tensor<2x10x4096x64xf32> + %fill1 = linalg.fill ins(%cst : f32) + outs(%10 : tensor<2x10x64x4096xf32>) -> tensor<2x10x64x4096xf32> + %14:3 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d2)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} + ins(%arg0, %arg1, %arg2, %arg3 + : tensor<2x4096x640xf16>, tensor<10x64x640xf16>, tensor<10x64x640xf16>, + tensor<10x64x640xf16>) + outs(%fill0, %fill0, %fill1 + : tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x64x4096xf32>) { + ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %out: f32, %out_3: f32, %out_4: f32): + %18 = arith.extf %in : f16 to f32 + %19 = arith.extf %in_0 : f16 to f32 + %20 = arith.mulf %18, %19 : f32 + %21 = arith.addf %out, %20 : f32 + %22 = arith.extf %in_1 : f16 to f32 + %23 = arith.mulf %18, %22 : f32 + %24 = arith.addf %out_3, %23 : f32 + %25 = arith.extf %in_2 : f16 to f32 + %26 = arith.mulf %18, %25 : f32 + %27 = arith.addf %out_4, %26 : f32 + linalg.yield %21, %24, %27 : f32, f32, f32 + } -> (tensor<2x10x4096x64xf32>, tensor<2x10x4096x64xf32>, tensor<2x10x64x4096xf32>) + %15 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#0 : tensor<2x10x4096x64xf32>) outs(%11 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : f16 + } -> tensor<2x10x4096x64xf16> + %16 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#1 : tensor<2x10x4096x64xf32>) outs(%11 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : f16 + } -> tensor<2x10x4096x64xf16> + %17 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%14#2 : tensor<2x10x64x4096xf32>) outs(%9 : tensor<2x10x64x4096xf16>) { + ^bb0(%in: f32, %out: f16): + %18 = arith.truncf %in : f32 to f16 + linalg.yield %18 : 
f16 + } -> tensor<2x10x64x4096xf16> + return %15, %16, %17 + : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16> +} +// CHECK-LABEL: func @fused_contraction_4 +// CHECK-SAME: translation_info = #iree_codegen.translation_info +// CHECK-SAME: pipeline = LLVMGPUVectorDistribute +// CHECK-SAME: workgroup_size = [256, 1, 1] +// CHECK-SAME: subgroup_size = 64 +// CHECK: %[[GENERIC:.+]]:3 = linalg.generic +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layoutgetNumOperands() == 1) { + auto iface = dyn_cast(op); + if (!iface || !iface.hasNoEffect()) + break; + value = op->getOperand(0); + op = value.getDefiningOp(); + } + return value; +} + +struct ContractionOpSequenceArgs { + std::pair operands; + BlockArgument accumulator; +}; +static std::optional +isContractionOpSequence(Value yielded, + function_ref isaPair) { + Operation *reductionOp = yielded.getDefiningOp(); + if (reductionOp->getNumResults() != 1 || reductionOp->getNumOperands() != 2) { + return std::nullopt; + } + + Value reductionLHS = getSourceSkipUnary(reductionOp->getOperand(0)); + Value reductionRHS = getSourceSkipUnary(reductionOp->getOperand(1)); + + BlockArgument updated = dyn_cast(reductionRHS); + Value contributed = reductionLHS; + if (!updated) { + updated = dyn_cast(reductionLHS); + if (!updated) { + return std::nullopt; + } + contributed = reductionRHS; + } + contributed = getSourceSkipUnary(contributed); + + Operation *elementwiseOp = contributed.getDefiningOp(); + if (!elementwiseOp || elementwiseOp->getNumResults() != 1 || + elementwiseOp->getNumOperands() != 2) { + return std::nullopt; + } + + if (!isaPair(elementwiseOp, reductionOp)) { + return std::nullopt; + } + + auto elementwiseLHS = dyn_cast_or_null( + getSourceSkipUnary(elementwiseOp->getOperand(0))); + auto elementwiseRHS = dyn_cast_or_null( + getSourceSkipUnary(elementwiseOp->getOperand(1))); + if (!elementwiseLHS || !elementwiseRHS) { + return std::nullopt; + } + + return ContractionOpSequenceArgs{{elementwiseLHS, elementwiseRHS}, updated}; +} + +/// Returns true if the two operations are of the kinds specified by a pair of +/// consecutive template arguments. +template +static bool isPairTemplateImpl(Operation *add, Operation *mul) { + static_assert(sizeof...(Args) % 2 == 0, + "expected an even number of template arguments"); + if (isa(add) && isa(mul)) + return true; + + if constexpr (sizeof...(Args) > 0) + return isPairTemplateImpl(add, mul); + else + return false; +} + +/// Returns true if the block is a body of a contraction with the kinds of +/// operations given pairwise by template arguments. +template +static std::optional +isContractionOpSequence(Value yielded) { + return isContractionOpSequence(yielded, &isPairTemplateImpl); +} + +/// Recognize an operation that is horizontally fused contraction. +/// TODO: The logic below is quite convoluted. Might be better +/// off having a dedicated operation for this. +bool isaHorizontallyFusedContraction(linalg::LinalgOp linalgOp) { + if (linalgOp->getNumResults() == 1) { + return false; + } + // Check that the number of `ins` is one more than the number of results. 
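+  // For example, three GEMMs that share one LHS are fused into an op with
+  // ins = (LHS, RHS0, RHS1, RHS2) and outs = (ACC0, ACC1, ACC2), so
+  // getNumDpsInputs() == 4 == getNumResults() + 1.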
+ if (linalgOp.getNumDpsInputs() != linalgOp->getNumResults() + 1) { + return false; + } + SmallVector indexingMaps = linalgOp.getIndexingMapsArray(); + if (!llvm::all_of(indexingMaps, [](AffineMap m) { + return m.isProjectedPermutation() && !m.isPermutation(); + })) { + return false; + } + + llvm::SetVector rhsArgs; + llvm::SetVector outArgs; + for (auto [index, yieldedVal] : + llvm::enumerate(linalgOp.getBlock()->getTerminator()->getOperands())) { + std::optional args = + isContractionOpSequence(yieldedVal); + if (!args) { + return false; + } + BlockArgument lhs = args->operands.first; + BlockArgument rhs = args->operands.second; + + // One of the block arguments must be argument 0, corresponding to the LHS. + if (lhs.getArgNumber() != 0) { + if (rhs.getArgNumber() != 0) { + return false; + } + std::swap(lhs, rhs); + } + assert(rhs.getArgNumber() != 0 && "cannot have rhs be arg number 0"); + if (rhs.getArgNumber() != index + 1) { + return false; + } + BlockArgument accumulator = args->accumulator; + if (accumulator.getArgNumber() != index + linalgOp.getNumDpsInputs()) { + return false; + } + } + + // Check that they have valid m, n and k dims. + ArrayRef indexingMapsRef(indexingMaps); + AffineMap lhsIndexingMap = indexingMaps.front(); + + auto getResultDims = [](AffineMap m) { + auto r = llvm::map_range(m.getResults(), [](AffineExpr e) { + return cast(e).getPosition(); + }); + return llvm::SmallDenseSet(r.begin(), r.end()); + }; + llvm::SmallDenseSet lhsDims = getResultDims(lhsIndexingMap); + + // Check that all the horizontally fused gemms have common N-dims. M and K + // dims are already known consistent since they are what the LHS has. + std::optional> refNDimsSet; + for (auto [rhsIndexingMap, outputIndexingMap] : + llvm::zip_equal(indexingMapsRef.slice(1, linalgOp.getNumDpsInputs() - 1), + indexingMapsRef.take_back(linalgOp.getNumDpsInits()))) { + llvm::SmallDenseSet rhsDims = getResultDims(rhsIndexingMap); + llvm::SmallDenseSet outsDims = getResultDims(outputIndexingMap); + llvm::SmallDenseSet mDims = lhsDims; + llvm::set_intersect(mDims, outsDims); + if (mDims.empty()) { + return false; + } + llvm::SmallDenseSet nDims = rhsDims; + llvm::set_intersect(nDims, outsDims); + if (nDims.empty()) { + return false; + } + llvm::SmallDenseSet kDims = lhsDims; + llvm::set_intersect(kDims, rhsDims); + if (kDims.empty()) { + return false; + } + + if (refNDimsSet) { + if (!llvm::all_of(nDims, [&](unsigned nDim) { + return refNDimsSet->contains(nDim); + })) { + return false; + } + } else { + refNDimsSet = std::move(nDims); + } + } + return true; +} + } // namespace mlir::iree_compiler::IREE::LinalgExt diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h b/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h index b9afb32fbd5e..c02699f80c25 100644 --- a/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h +++ b/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h @@ -211,5 +211,10 @@ bool isBroadcastingOp(linalg::LinalgOp op); /// 2. `linalg.yield` consumes the result of a `tensor.extract_slice` bool isGatherlikeOp(Operation *op); +/// Check if a given operation is a horizontally fused contraction operation. +/// The expectation is that the LHS is common, and all the operands are +/// different RHS. +bool isaHorizontallyFusedContraction(linalg::LinalgOp genericOp); + } // namespace mlir::iree_compiler::IREE::LinalgExt #endif // IREE_COMPILER_DIALECT_LINALGEXT_UTILS_UTILS_H_
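To make the m/n/k validation in isaHorizontallyFusedContraction concrete, consider
the indexing maps of @fused_contraction_1 in the new test file: the LHS uses
(d0, d2, d4), every RHS uses (d1, d3, d4), and every result uses (d0, d1, d2, d3).
The intersections the utility computes are lhs ∩ out = {d0, d2} (the M-like dims,
here including the batch dim d0), rhs ∩ out = {d1, d3} (the N dims), and
lhs ∩ rhs = {d4} (the K dims). All three sets are non-empty and every RHS/output
pair produces the same N set, so the op is accepted; in @fused_contraction_4 the
last result map is a permutation, (d0, d1, d3, d2), but its dim set is unchanged,
so the common-N check still passes.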
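The KernelConfig.cpp hunk above relies on the caller having already matched the op
with isaHorizontallyFusedContraction before asserting. A minimal sketch of that
control flow as one standalone helper is below; the name inferFusedContractionDims
and its packaging as a free function are hypothetical (not part of this patch), it
assumes the same headers as KernelConfig.cpp, and it uses the ArrayRef<AffineMap>
overload of inferContractionDims that the patch itself calls.

// Hypothetical helper mirroring the logic added to
// setMatmulVectorDistributionConfig in this patch.
static FailureOr<mlir::linalg::ContractionDimensions>
inferFusedContractionDims(linalg::LinalgOp op) {
  // The single-result path: defer to upstream inference.
  FailureOr<mlir::linalg::ContractionDimensions> dims =
      mlir::linalg::inferContractionDims(op);
  if (succeeded(dims))
    return dims;
  if (!IREE::LinalgExt::isaHorizontallyFusedContraction(op))
    return failure();
  // All fused GEMMs share the LHS (hence M and K) and a common N set, so the
  // LHS, the first RHS, and the first init describe one representative
  // contraction whose dims can be inferred from the maps alone.
  SmallVector<AffineMap> indexingMaps = {
      op.getMatchingIndexingMap(op.getDpsInputOperand(0)),
      op.getMatchingIndexingMap(op.getDpsInputOperand(1)),
      op.getMatchingIndexingMap(op.getDpsInitOperand(0))};
  return mlir::linalg::inferContractionDims(indexingMaps);
}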