diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
index aff7b8965b320..840d45fc27cb4 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -46,8 +46,8 @@ argmax_types = [
 [iree_amdgpu_bitcode_library(
     name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch),
     srcs = [
-        "iree_uk_amdgpu_argmax_%s.c" % type,
         "common.h",
+        "iree_uk_amdgpu_argmax_%s.c" % type,
     ],
     out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
     gpu_arch = gpu_arch,
@@ -59,9 +59,21 @@ argmax_bc_files = [
     for gpu_arch in gpu_archs
 ]
 
+iree_amdgpu_bitcode_library(
+    name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942",
+    srcs = [
+        "common.h",
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c",
+    ],
+    out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    gpu_arch = "gfx942",
+)
+
 iree_c_embed_data(
     name = "iree_uk_amdgpu_bitcode",
-    srcs = argmax_bc_files,
+    srcs = argmax_bc_files + [
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    ],
     c_file_output = "iree_uk_amdgpu_bitcode.c",
     flatten = True,
     h_file_output = "iree_uk_amdgpu_bitcode.h",
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
index 71d4705eed1a6..ad1a19028a5b4 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
 )
 
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942
+  GPU_ARCH
+    gfx942
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c"
+  OUT
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
+)
+
 iree_c_embed_data(
   NAME
     iree_uk_amdgpu_bitcode
@@ -226,6 +238,7 @@ iree_c_embed_data(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
   C_FILE_OUTPUT
     "iree_uk_amdgpu_bitcode.c"
   H_FILE_OUTPUT
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
new file mode 100644
index 0000000000000..e7fd7649ad336
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
@@ -0,0 +1,55 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
+
+// Very naive kernel: accumulates A*B MFMA products into C. TODO(bjacob):
+// 1. Shared memory: can't allocate it within the microkernel (which is just a
+//    helper device function, not the actual amdgpu_kernel). Need to get it
+//    passed down here as a `T [[clang::address_space(3)]] *` parameter.
+// 2. Better scheduling via either barrier intrinsics or inline assembly.
+// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
+[[clang::always_inline, clang::convergent]] void
+iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4(
+    const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
+    int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) {
+  typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
+  typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;
+  int tid = __builtin_amdgcn_workitem_id_x();
+
+  // Load existing accumulators: tile (i, j) spans 256 int32x4_t, tid-indexed.
+  int32x4_t acc[8][2] = {{0}};
+  int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      acc[i][j] = c_global[256 * (2 * i + j) + tid];
+    }
+  }
+
+  // Arithmetic loop (64-bit counter to match k_size). A uses tid % 64, B tid.
+  const int64x2_t *a_global =
+      (const int64x2_t *)(a_buffer + a_offset) + (tid % 64);
+  const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid;
+  for (int64_t k_outer = 0; k_outer < k_size; ++k_outer) {
+    for (int i = 0; i < 8; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 2; ++k) {
+          acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8(
+              a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0);
+        }
+      }
+    }
+    a_global += 512;
+    b_global += 512;
+  }
+
+  // Store accumulators back to the same C slots they were loaded from.
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      c_global[256 * (2 * i + j) + tid] = acc[i][j];
+    }
+  }
+}
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
index 2a71f590c6e30..7201e4b988e84 100644
--- a/compiler/plugins/target/ROCM/test/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -17,6 +17,7 @@ iree_lit_test_suite(
     srcs = [
         "config_ukernel_argmax_gfx908.mlir",
         "config_ukernel_argmax_gfx942.mlir",
+        "config_ukernel_multi_mma_gfx942.mlir",
         "default_tuning_specs_amdgpu.mlir",
         "lowering_strategy_from_tuning_spec.mlir",
         "ukernel_pipeline_transform.mlir",
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
index bab88582a8b0e..06249daa0039e 100644
--- a/compiler/plugins/target/ROCM/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_lit_test_suite(
   SRCS
     "config_ukernel_argmax_gfx908.mlir"
     "config_ukernel_argmax_gfx942.mlir"
+    "config_ukernel_multi_mma_gfx942.mlir"
     "default_tuning_specs_amdgpu.mlir"
     "lowering_strategy_from_tuning_spec.mlir"
     "ukernel_pipeline_transform.mlir"
diff --git a/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir b/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
new file mode 100644
index 0000000000000..646418f806663
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
@@ -0,0 +1,29 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
+
+func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>,
+                                %b : tensor<1x2x4x2x4x16x2x8xi8>,
+                                %c : tensor<1x1x8x4x2x4x16x4xi32>)
+    -> tensor<1x1x8x4x2x4x16x4xi32> attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}>
+} {
+  %d = iree_gpu.multi_mma %a, %b, %c {indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d0, d2)>,
+      affine_map<(d0, d1, d2) -> (d1, d2)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ], iterator_types = [
+      #iree_gpu.iterator_type<parallel>,
+      #iree_gpu.iterator_type<parallel>,
+      #iree_gpu.iterator_type<reduction>
+    ], kind = #iree_gpu.data_tiled_mma_layout<
+      intrinsic =  MFMA_I32_16x16x32_I8,
+      unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2
+    >} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
+  return %d : tensor<1x1x8x4x2x4x16x4xi32>
+}
+
+// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8
+//       CHECK: iree_gpu.multi_mma
+//  CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
+//  CHECK-NOT:  promote_operands
+//  CHECK-SAME: reduction = [0, 0, 0]
+//  CHECK-SAME: #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 0a734b4d96fb3..6a0700362ab61 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -33,10 +33,9 @@ namespace mlir::iree_compiler::IREE::GPU {
 
 constexpr int64_t kCacheLineSizeBits = 128 * 8;
 
-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op) {
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op);
   if (!multiMmaOp) {
     return failure();
@@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> reductionTileSizes(iterationRank, 0);
   for (int64_t kDim : contractionDims.k) {
     workgroupTileSizes[kDim] = 0;
-    reductionTileSizes[kDim] = 1;
+    reductionTileSizes[kDim] = ukernelConfig ? 0 : 1;
   }
 
   // Set tile sizes.
@@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
                      b.getI64ArrayAttr(workgroupTileSizes));
   attrs.emplace_back(b.getStringAttr("reduction"),
                      b.getI64ArrayAttr(reductionTileSizes));
-  // Promote operands to use shared memory for LHS and RHS.
-  GPU::setPromotedOperandList(context, attrs, {0, 1});
+  if (ukernelConfig) {
+    attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig);
+  } else {
+    // Promote operands to use shared memory for LHS and RHS.
+    // Don't do that with ukernels: their untiled reduction dimension is too
+    // large to fit in shared memory, so they just want global memory and they
+    // will take care of moving small chunks at a time into a shared memory
+    // operand that will be created together with the ukernel op.
+    GPU::setPromotedOperandList(context, attrs, {0, 1});
+  }
   auto configDict = b.getDictionaryAttr(attrs);
   auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
index 0458ea91d6adf..636ffe5f0898e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU {
 
 /// Helper for setting up a data tiled multi_mma config based on the specified
 /// target.
-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op);
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig);
 
 /// Helper for setting up a convolution config using IGEMM based on the
 /// specified target.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index be069e7282993..fc890d1db70da 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -2043,15 +2043,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
 /// Set the configuration for argmax when ukernels are enabled.
 /// Distribute all parallel dim across different workgroups, and only use single
 /// subgroup per workgroup.
-static LogicalResult
-setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
-                       mlir::FunctionOpInterface entryPoint,
-                       linalg::GenericOp op) {
-  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
-  if (!ukernelConfig) {
-    return failure();
-  }
-
+static LogicalResult setArgmaxUkernelConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   SmallVector<unsigned> parallelDims;
   SmallVector<unsigned> reductionDims;
   op.getParallelDims(parallelDims);
@@ -2114,15 +2108,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
   return success();
 }
 
-/// Make UKernels take the LLVMGPUDefault lowering pipeline.
-static LogicalResult
-setUKernelConfig(mlir::FunctionOpInterface entryPoint,
-                 IREE::Codegen::UKernelOpInterface ukernelOp) {
-  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-      entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault);
-  return setTranslationInfo(entryPoint, translationInfo);
-}
-
 /// Decides the tiling and distribution parameters for one convolution
 /// dimension. Returns true if we can succesfully deduce.
 ///
@@ -2302,13 +2287,14 @@ static LogicalResult setConvolutionConfig(
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   LLVM_DEBUG({
     DBGS() << "Selecting root config for: ";
     computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
     llvm::dbgs() << "\n";
   });
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
@@ -2354,8 +2340,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) {
       LDBG("Transpose Config");
       return success();
-    } else if (genericOp && succeeded(setArgmaxUkernelConfig(
-                                target, entryPointFn, genericOp))) {
+    } else if (genericOp && ukernelConfig &&
+               succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp,
+                                                ukernelConfig))) {
       LDBG("Argmax Ukernel Config");
       return success();
     }
@@ -2379,10 +2366,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
         LDBG("Pack Config");
         return setPackConfig(target, entryPointFn, packOp);
       })
-      .Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) {
-        LDBG("Ukernel Config");
-        return setUKernelConfig(entryPointFn, ukernelOp);
-      })
       .Case<IREE::LinalgExt::CustomOp>([&](auto customOp) {
         LDBG("CustomOp Config");
         return setDefaultCustomOpLoweringConfig(entryPointFn, customOp,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
index 26245cba1b257..c52cd07a1cd14 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"
 
+#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
@@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     return success();
   }
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
index 2f2861f926cc4..83946299e26b4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Utils/EmbeddedDataDirectory.h"
@@ -29,6 +30,26 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
                                   indexType.getElementType())};
 }
 
+// Computes the multi_mma ukernel (name, suffix); an empty name = no ukernel.
+static std::tuple<std::string, std::string>
+getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) {
+  auto attr = dyn_cast<IREE::GPU::DataTiledMMAAttr>(op.getKind());
+  if (!attr) {
+    return {}; // Kinds other than DataTiledMMAAttr are not handled yet.
+  }
+  auto tag = stringifyMMAIntrinsic(attr.getIntrinsic().getValue()).lower();
+  if (!(attr.getUnrollM() == 1 && attr.getUnrollN() == 1 &&
+        attr.getUnrollK() == 1)) {
+    tag += llvm::formatv("_unroll{}x{}x{}", attr.getUnrollM(),
+                         attr.getUnrollN(), attr.getUnrollK());
+  }
+  if (!(attr.getSubgroupsM() == 1 && attr.getSubgroupsN() == 1)) {
+    tag += llvm::formatv("_subgroups{}x{}", attr.getSubgroupsM(),
+                         attr.getSubgroupsN());
+  }
+  return {"multi_mma", tag};
+}
+
 // Returns ukernel name and suffix for any op. Empty name = no ukernel.
 static std::tuple<std::string, std::string>
 getUKernelNameAndSuffix(Operation *op) {
@@ -36,6 +57,8 @@ getUKernelNameAndSuffix(Operation *op) {
     if (succeeded(isArgmaxOp(genericOp))) {
       return getUKernelNameAndSuffixForArgmax(genericOp);
     }
+  } else if (auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op)) {
+    return getUKernelNameAndSuffixForMultiMma(multiMmaOp);
   }
   return {};
 }