From 60e9143afb9425cf1e0eb6758332e721bfa282ab Mon Sep 17 00:00:00 2001
From: MaheshRavishankar
Date: Wed, 12 Feb 2025 21:23:56 -0600
Subject: [PATCH 1/4] [GPU] Take element type bitwidth into account with vector size.

Co-authored-by: MaheshRavishankar
Signed-off-by: MaheshRavishankar
Signed-off-by: Nirvedh
---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 66 ++++++++++++++++---
 .../test/ROCDL/config_tile_and_fuse.mlir      | 19 ++++++
 2 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index c447c9635167..1205942dd9a2 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -503,6 +503,45 @@ static bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
          getElementCount(contractionDims->n) != 1;
 }
 
+// To find the number of vector elements per work-item, find a
+// bit width that is representative of the computation.
+static unsigned getRepresentativeBitWidth(linalg::LinalgOp linalgOp) {
+  // Check all the inputs with permutation indexing maps. Use
+  // the maximum of those to get the bit width.
+  std::optional<unsigned> maxBitWidth;
+  auto updateElementTypeBitWidth = [&](Value v) {
+    auto elementType = getElementTypeOrSelf(v);
+    unsigned bitWidth = elementType.getIntOrFloatBitWidth();
+    if (maxBitWidth) {
+      maxBitWidth = std::max(maxBitWidth.value(), bitWidth);
+      return;
+    }
+    maxBitWidth = bitWidth;
+  };
+  for (OpOperand *input : linalgOp.getDpsInputOperands()) {
+    AffineMap inputOperandMap = linalgOp.getMatchingIndexingMap(input);
+    if (!inputOperandMap.isPermutation()) {
+      continue;
+    }
+    updateElementTypeBitWidth(input->get());
+  }
+  if (maxBitWidth) {
+    return maxBitWidth.value();
+  }
+
+  // If none of the inputs has a permutation indexing map, use the results.
+  // Don't bother about their indexing maps.
+  for (OpOperand &output : linalgOp.getDpsInitsMutable()) {
+    updateElementTypeBitWidth(output.get());
+  }
+  if (maxBitWidth) {
+    return maxBitWidth.value();
+  }
+
+  // Fall back to a full 32-bit word.
+  return 32;
+}
+
 LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
                                            mlir::FunctionOpInterface entryPoint,
                                            Operation *op) {
@@ -572,6 +611,7 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   bool vectorizable = projPerm && powTwo;
 
   const unsigned minBitwidth = getMinElementBitwidth(linalgOp);
+  const unsigned representativeBitWidth = getRepresentativeBitWidth(linalgOp);
   // Make sure we use a tile size that results in some integral number of bytes.
   const unsigned scaleToByte =
       std::max(8 / minBitwidth, static_cast<unsigned>(1));
@@ -580,7 +620,7 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   auto distributeToThreads = [&](int64_t numThreads,
                                  std::optional<int64_t> lossFactor =
                                      std::nullopt) {
-    LDBG("Loss factor: " << lossFactor << "\n");
+    LDBG("Loss factor: " << lossFactor);
     // Initialize the configuration.
     flatWorkgroupSize = 1;
     // Initialize thread tiling along all partitioned loops with size 1, and
@@ -607,13 +647,23 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
 
       // Ensure vectorization works with the `workgroupTileMultiple`.
       int64_t workgroupTileMultiple = workgroupTileSizeMultiples[shapeDim];
+      unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
+      int64_t vecorizableCandidate = numVectorElements * numThreads;
+      // For smaller shapes, we reduce `numVectorElements` as we may not find
+      // work for all threads otherwise and we don't have vectorization enabled
+      // with loss.
+      while (vectorizable && (vecorizableCandidate > loopBound) &&
+             numVectorElements > 4) {
+        numVectorElements /= 2;
+        vecorizableCandidate = numVectorElements * numThreads;
+      }
       vectorizable =
-          vectorizable && 4 * numThreads % workgroupTileMultiple == 0;
-      // For the inner most workgroup dim, try to see if we can have 4
-      // elements per thread. This enables vectorization.
+          vectorizable && vecorizableCandidate % workgroupTileMultiple == 0;
+
       if (vectorizable && wgDim == 0 && !lossFactor) {
-        candidates.push_back(4 * numThreads);
+        candidates.push_back(vecorizableCandidate);
       }
+
       // Try all power of two multiples of `workgroupTileMultiple` up to the
       // subgroup size.
       uint64_t maxCandidate =
@@ -645,17 +695,17 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
           llvm::divideCeil(loopBound, scaledTileSize) <= 2) {
         continue;
       }
-      // Try to let each thread handle 4 elements if this is the workgroup x
-      // dimension.
       // TODO: Try to take into account element type bit width to get
       // 4xdword reads instead of 4x{elements}.
-      if (vectorizable && wgDim == 0 && !lossFactor && candidate % 4 == 0) {
+      if (vectorizable && wgDim == 0 && !lossFactor &&
+          candidate % numVectorElements == 0) {
         // Use size-1 vectors to increase parallelism if larger ones cause
        // idle threads in the subgroup.
         bool hasIdleThreads =
             partitionableLoops.size() == 1 && candidate <= subgroupSize;
-        int vectorSize = hasIdleThreads ? 1 : 4;
+        int vectorSize = hasIdleThreads ? 1 : numVectorElements;
         LLVM_DEBUG(llvm::dbgs() << "Use vector size: " << vectorSize << "\n");
         threadTileSizes[shapeDim] = vectorSize * scaleToByte;
         candidateWorkgroupSize = candidate / vectorSize;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index c769f0424860..6ba78769d765 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -219,6 +219,25 @@ module {
 
 // -----
 
+module {
+  func.func @elementwise_dynamic_dim_large(%11: tensor, %12: tensor) -> tensor {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %8 = tensor.dim %11, %c0 : tensor
+    %13 = tensor.empty(%8) : tensor
+    %15 = linalg.add ins(%11, %12 : tensor, tensor) outs(%13 : tensor) -> tensor
+    return %15 : tensor
+  }
+}
+
+// CHECK-LABEL: func.func @elementwise_dynamic_dim_large
+// CHECK-SAME: #iree_codegen.translation_info
+// CHECK: linalg.add {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: thread = [1, 8]
+// CHECK-SAME: workgroup = [1, 512]
+
+// -----
+
 module @elementwise_unaligned {
   func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
     %cst = arith.constant 0.000000e+00 : f32

From 792df23bbc8d52788599a87c80b3d74fd64e0510 Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Thu, 13 Feb 2025 20:15:52 -0600
Subject: [PATCH 2/4] fix test and add erf test

Signed-off-by: Nirvedh
---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 10 +--
 .../LLVMGPU/test/rocdl_pipeline_test.mlir     | 73 ++++++++++++++++++-
 2 files changed, 74 insertions(+), 9 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 1205942dd9a2..1ac741871780 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -648,20 +648,20 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
       // Ensure vectorization works with the `workgroupTileMultiple`.
       int64_t workgroupTileMultiple = workgroupTileSizeMultiples[shapeDim];
       unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
-      int64_t vecorizableCandidate = numVectorElements * numThreads;
+      int64_t vectorizableCandidate = numVectorElements * numThreads;
       // For smaller shapes, we reduce `numVectorElements` as we may not find
       // work for all threads otherwise and we don't have vectorization enabled
       // with loss.
-      while (vectorizable && (vecorizableCandidate > loopBound) &&
+      while (vectorizable && (vectorizableCandidate > loopBound) &&
              numVectorElements > 4) {
         numVectorElements /= 2;
-        vecorizableCandidate = numVectorElements * numThreads;
+        vectorizableCandidate = numVectorElements * numThreads;
       }
       vectorizable =
-          vectorizable && vecorizableCandidate % workgroupTileMultiple == 0;
+          vectorizable && vectorizableCandidate % workgroupTileMultiple == 0;
 
       if (vectorizable && wgDim == 0 && !lossFactor) {
-        candidates.push_back(vecorizableCandidate);
+        candidates.push_back(vectorizableCandidate);
       }
 
       // Try all power of two multiples of `workgroupTileMultiple` up to the
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index 941bcf60989c..ad34cd771c4b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -137,10 +137,10 @@ hal.executable @ext_fp8_dispatch {
 
 // CDNA3-LABEL: hal.executable public @ext_fp8_dispatch
 // CDNA3: hal.executable.variant public @rocm
-// CDNA3-COUNT-4: rocdl.cvt.f32.fp8 %{{.*}} : f32
-// CDNA3-COUNT-4: rocdl.cvt.f32.bf8 %{{.*}} : f32
-// CDNA3: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<4xf32>
-// CDNA3: llvm.store %[[ADD]], %{{.*}} : vector<4xf32>, !llvm.ptr<1>
+// CDNA3-COUNT-16: rocdl.cvt.f32.fp8 %{{.*}} : f32
+// CDNA3-COUNT-16: rocdl.cvt.f32.bf8 %{{.*}} : f32
+// CDNA3: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<16xf32>
+// CDNA3: llvm.store %[[ADD]], %{{.*}} : vector<16xf32>, !llvm.ptr<1>
 
 // -----
 
@@ -189,3 +189,68 @@ hal.executable @ceildiv_expand_dispatch {
 // CDNA3-COUNT-2: llvm.and {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.or {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.select {{.*}} : vector<1xi1>, vector<1xi32>
+
+// -----
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @erf_dispatch {
+  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export @erf layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
+      hal.return %x, %y, %z : index, index, index
+    }
+builtin.module {
+  func.func @erf() {
+    %cst = arith.constant 1.270000e+02 : f16
+    %cst_0 = arith.constant -1.280000e+02 : f16
+    %cst_1 = arith.constant 5.000000e-01 : f16
+    %cst_2 = arith.constant 1.000000e+00 : f16
+    %cst_3 = arith.constant 2.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>>
+    %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<readonly:tensor<5120xf16>>
+    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
+    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
+    %9 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16>
+    %10 = flow.dispatch.tensor.load %7, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+    %11 = tensor.empty() : tensor<2x1024x5120xi8>
+    %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
+    %13 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
+    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
+    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
+      %15 = math.sqrt %cst_3 : f16
+      %16 = arith.divf %in_4, %15 : f16
+      %17 = math.erf %16 : f16
+      %18 = arith.addf %17, %cst_2 : f16
+      %19 = arith.mulf %18, %cst_1 : f16
+      %20 = arith.mulf %in_4, %19 : f16
+      %21 = arith.mulf %in, %20 : f16
+      %22 = arith.mulf %21, %in_5 : f16
+      %23 = arith.truncf %in_6 : f32 to f16
+      %24 = arith.divf %22, %23 : f16
+      %25 = math.roundeven %24 : f16
+      %26 = arith.cmpf ult, %25, %cst_0 : f16
+      %27 = arith.select %26, %cst_0, %25 : f16
+      %28 = arith.cmpf ugt, %27, %cst : f16
+      %29 = arith.select %28, %cst, %27 : f16
+      %30 = arith.fptosi %29 : f16 to i8
+      linalg.yield %30 : i8
+    } -> tensor<2x1024x5120xi8>
+    flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
+    return
+  }
+}
+  }
+}
+
+// CDNA3-LABEL: hal.executable public @erf_dispatch
+// CDNA3: hal.executable.variant public @rocm
+// CDNA3-COUNT-19: llvm.select {{.*}} : vector<8xi1>, vector<8xf32>
+// CDNA3-COUNT-8: llvm.intr.fma{{.*}} : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+// CDNA3: %[[RESULT:.+]] = llvm.fptosi {{.*}} : vector<8xf16> to vector<8xi8>
+// CDNA3: llvm.store %[[RESULT]], %{{.*}} : vector<8xi8>, !llvm.ptr<1>

From 6485474f9e9ffe63a8732838050014e7c4e91c99 Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Thu, 13 Feb 2025 21:14:02 -0600
Subject: [PATCH 3/4] add config test for erf

Signed-off-by: Nirvedh
---
 .../test/ROCDL/config_tile_and_fuse.mlir      | 41 ++++++------
 .../LLVMGPU/test/rocdl_pipeline_test.mlir     | 65 -------------------
 2 files changed, 22 insertions(+), 84 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 6ba78769d765..5901999d5adb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -219,25 +219,6 @@ module {
 
 // -----
 
-module {
-  func.func @elementwise_dynamic_dim_large(%11: tensor, %12: tensor) -> tensor {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %8 = tensor.dim %11, %c0 : tensor
-    %13 = tensor.empty(%8) : tensor
-    %15 = linalg.add ins(%11, %12 : tensor, tensor) outs(%13 : tensor) -> tensor
-    return %15 : tensor
-  }
-}
-
-// CHECK-LABEL: func.func @elementwise_dynamic_dim_large
-// CHECK-SAME: #iree_codegen.translation_info
-// CHECK: linalg.add {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: thread = [1, 8]
-// CHECK-SAME: workgroup = [1, 512]
-
-// -----
-
 module @elementwise_unaligned {
   func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
     %cst = arith.constant 0.000000e+00 : f32
@@ -673,3 +654,25 @@ func.func @pack_dynamic_tile(%arg0: tensor<32x32xi8>, %d0: index, %d1: index, %t
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: thread = [1, 4]
 // CHECK-SAME: workgroup = [8, 32]
+
+// -----
+
+module {
+  func.func @erf(%13 : tensor<2x1024x5120xf16>, %12 : tensor<2x1024x5120xf16>, %9 : tensor<5120xf16>, %10 : tensor<f32>) -> tensor<2x1024x5120xi8> {
+    %cst = arith.constant 0.000000e+00 : f16
+    %11 = tensor.empty() : tensor<2x1024x5120xi8>
+    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
+    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
+      %17 = math.erf %in : f16
+      %30 = arith.fptosi %17 : f16 to i8
+      linalg.yield %30 : i8
+    } -> tensor<2x1024x5120xi8>
+    return %14 : tensor<2x1024x5120xi8>
+  }
+}
+
+// CHECK-LABEL: func.func @erf
+// CHECK-SAME: #iree_codegen.translation_info
+// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: thread = [1, 1, 8]
+// CHECK-SAME: workgroup = [1, 1, 512]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index ad34cd771c4b..b4f8d8259925 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -189,68 +189,3 @@ hal.executable @ceildiv_expand_dispatch {
 // CDNA3-COUNT-2: llvm.and {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.or {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.select {{.*}} : vector<1xi1>, vector<1xi32>
-
-// -----
-#pipeline_layout = #hal.pipeline.layout<bindings = [
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-hal.executable @erf_dispatch {
-  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export @erf layout(#pipeline_layout) {
-    ^bb0(%arg0: !hal.device, %arg1: index):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
-      hal.return %x, %y, %z : index, index, index
-    }
-builtin.module {
-  func.func @erf() {
-    %cst = arith.constant 1.270000e+02 : f16
-    %cst_0 = arith.constant -1.280000e+02 : f16
-    %cst_1 = arith.constant 5.000000e-01 : f16
-    %cst_2 = arith.constant 1.000000e+00 : f16
-    %cst_3 = arith.constant 2.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>>
-    %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<readonly:tensor<5120xf16>>
-    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
-    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
-    %9 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16>
-    %10 = flow.dispatch.tensor.load %7, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %11 = tensor.empty() : tensor<2x1024x5120xi8>
-    %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
-    %13 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
-    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
-    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
-      %15 = math.sqrt %cst_3 : f16
-      %16 = arith.divf %in_4, %15 : f16
-      %17 = math.erf %16 : f16
-      %18 = arith.addf %17, %cst_2 : f16
-      %19 = arith.mulf %18, %cst_1 : f16
-      %20 = arith.mulf %in_4, %19 : f16
-      %21 = arith.mulf %in, %20 : f16
-      %22 = arith.mulf %21, %in_5 : f16
-      %23 = arith.truncf %in_6 : f32 to f16
-      %24 = arith.divf %22, %23 : f16
-      %25 = math.roundeven %24 : f16
-      %26 = arith.cmpf ult, %25, %cst_0 : f16
-      %27 = arith.select %26, %cst_0, %25 : f16
-      %28 = arith.cmpf ugt, %27, %cst : f16
-      %29 = arith.select %28, %cst, %27 : f16
-      %30 = arith.fptosi %29 : f16 to i8
-      linalg.yield %30 : i8
-    } -> tensor<2x1024x5120xi8>
-    flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
-    return
-  }
-}
-  }
-}
-
-// CDNA3-LABEL: hal.executable public @erf_dispatch
-// CDNA3: hal.executable.variant public @rocm
-// CDNA3-COUNT-19: llvm.select {{.*}} : vector<8xi1>, vector<8xf32>
-// CDNA3-COUNT-8: llvm.intr.fma{{.*}} : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-// CDNA3: %[[RESULT:.+]] = llvm.fptosi {{.*}} : vector<8xf16> to vector<8xi8>
-// CDNA3: llvm.store %[[RESULT]], %{{.*}} : vector<8xi8>, !llvm.ptr<1>

From c26d3dae86eb9669947ec1c19e0a7c583986648e Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Fri, 14 Feb 2025 11:07:24 -0600
Subject: [PATCH 4/4] fail for complex types

Signed-off-by: Nirvedh
---
 .../compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 1ac741871780..98571af9262b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -511,6 +511,9 @@ static unsigned getRepresentativeBitWidth(linalg::LinalgOp linalgOp) {
   std::optional<unsigned> maxBitWidth;
   auto updateElementTypeBitWidth = [&](Value v) {
     auto elementType = getElementTypeOrSelf(v);
+    if (!elementType.isIntOrFloat()) {
+      return;
+    }
     unsigned bitWidth = elementType.getIntOrFloatBitWidth();
     if (maxBitWidth) {
      maxBitWidth = std::max(maxBitWidth.value(), bitWidth);
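
Illustration (not part of the patches): the series replaces the old fixed
4-elements-per-thread rule with a width derived from a representative element
bitwidth, targeting roughly 128 bits (one dword4) of data per lane per access.
The standalone C++ sketch below mirrors that logic from ConfigUtils.cpp;
`pickNumVectorElements` is a hypothetical name, and the `loopBound` and
`numThreads` sample values are invented for the example, not taken from the
compiler.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Aim for ~128 bits of data per thread, never below 4 elements, and halve
    // for small shapes so every thread still receives work (vectorization
    // with loss is not enabled).
    static unsigned pickNumVectorElements(unsigned representativeBitWidth,
                                          int64_t loopBound, int64_t numThreads,
                                          bool vectorizable) {
      unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
      int64_t candidate = numVectorElements * numThreads;
      while (vectorizable && candidate > loopBound && numVectorElements > 4) {
        numVectorElements /= 2;
        candidate = numVectorElements * numThreads;
      }
      return numVectorElements;
    }

    int main() {
      // f32 -> 4, f16 -> 8, fp8/i8 -> 16 elements per thread.
      std::printf("%u %u %u\n",
                  pickNumVectorElements(32, 1 << 20, 64, true),
                  pickNumVectorElements(16, 1 << 20, 64, true),
                  pickNumVectorElements(8, 1 << 20, 64, true));
      // Small loop bound: the f16 candidate 8 * 64 = 512 exceeds 256, so the
      // width is halved until 4 * 64 = 256 fits.
      std::printf("%u\n", pickNumVectorElements(16, 256, 64, true));
      return 0;
    }

This is also why the erf pipeline test expects vector<8xf16>-based codegen
(a 16-bit representative width gives 8 elements per thread) while the ext_fp8
checks move from vector<4xf32> to 16 converted elements per thread.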