From 60e9143afb9425cf1e0eb6758332e721bfa282ab Mon Sep 17 00:00:00 2001
From: MaheshRavishankar
Date: Wed, 12 Feb 2025 21:23:56 -0600
Subject: [PATCH 1/4] [GPU] Take element type bitwidth into account with vector size.

Co-authored-by: MaheshRavishankar
Signed-off-by: MaheshRavishankar
Signed-off-by: Nirvedh
---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 66 ++++++++++++++++---
 .../test/ROCDL/config_tile_and_fuse.mlir      | 19 ++++++
 2 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index c447c9635167..1205942dd9a2 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -503,6 +503,45 @@ static bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
          getElementCount(contractionDims->n) != 1;
 }
 
+// To find the number of vector elements per work-item, find a
+// bit width that is representative of the computation.
+static unsigned getRepresentativeBitWidth(linalg::LinalgOp linalgOp) {
+  // Check all the inputs with permutation indexing maps. Use
+  // the maximum of those to get the bit width.
+  std::optional<unsigned> maxBitWidth;
+  auto updateElementTypeBitWidth = [&](Value v) {
+    auto elementType = getElementTypeOrSelf(v);
+    unsigned bitWidth = elementType.getIntOrFloatBitWidth();
+    if (maxBitWidth) {
+      maxBitWidth = std::max(maxBitWidth.value(), bitWidth);
+      return;
+    }
+    maxBitWidth = bitWidth;
+  };
+  for (OpOperand *input : linalgOp.getDpsInputOperands()) {
+    AffineMap inputOperandMap = linalgOp.getMatchingIndexingMap(input);
+    if (!inputOperandMap.isPermutation()) {
+      continue;
+    }
+    updateElementTypeBitWidth(input->get());
+  }
+  if (maxBitWidth) {
+    return maxBitWidth.value();
+  }
+
+  // If none of the inputs has a permutation indexing map, use the results.
+  // Don't bother about their indexing maps.
+  for (OpOperand &output : linalgOp.getDpsInitsMutable()) {
+    updateElementTypeBitWidth(output.get());
+  }
+  if (maxBitWidth) {
+    return maxBitWidth.value();
+  }
+
+  // Fall back to a full 32-bit word.
+  return 32;
+}
+
 LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
                                            mlir::FunctionOpInterface entryPoint,
                                            Operation *op) {
@@ -572,6 +611,7 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   bool vectorizable = projPerm && powTwo;
 
   const unsigned minBitwidth = getMinElementBitwidth(linalgOp);
+  const unsigned representativeBitWidth = getRepresentativeBitWidth(linalgOp);
   // Make sure we use a tile size that results in some integral number of bytes.
   const unsigned scaleToByte =
       std::max(8 / minBitwidth, static_cast<unsigned>(1));
@@ -580,7 +620,7 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   auto distributeToThreads = [&](int64_t numThreads,
                                  std::optional<int64_t> lossFactor =
                                      std::nullopt) {
-    LDBG("Loss factor: " << lossFactor << "\n");
+    LDBG("Loss factor: " << lossFactor);
     // Initialize the configuration.
     flatWorkgroupSize = 1;
     // Initialize thread tiling along all partitioned loops with size 1, and
@@ -607,13 +647,23 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
 
       // Ensure vectorization works with the `workgroupTileMultiple`.
       int64_t workgroupTileMultiple = workgroupTileSizeMultiples[shapeDim];
+      unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
+      int64_t vecorizableCandidate = numVectorElements * numThreads;
+      // For smaller shapes, we reduce `numVectorElements` as we may not find
+      // work for all threads otherwise and we don't have vectorization enabled
+      // with loss.
+      while (vectorizable && (vecorizableCandidate > loopBound) &&
+             numVectorElements > 4) {
+        numVectorElements /= 2;
+        vecorizableCandidate = numVectorElements * numThreads;
+      }
       vectorizable =
-          vectorizable && 4 * numThreads % workgroupTileMultiple == 0;
-      // For the inner most workgroup dim, try to see if we can have 4
-      // elements per thread. This enables vectorization.
+          vectorizable && vecorizableCandidate % workgroupTileMultiple == 0;
+
       if (vectorizable && wgDim == 0 && !lossFactor) {
-        candidates.push_back(4 * numThreads);
+        candidates.push_back(vecorizableCandidate);
       }
+
       // Try all power of two multiples of `workgroupTileMultiple` up to the
       // subgroup size.
       uint64_t maxCandidate =
@@ -645,17 +695,17 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
           llvm::divideCeil(loopBound, scaledTileSize) <= 2) {
         continue;
       }
-      // Try to let each thread handle 4 elements if this is the workgroup x
-      // dimension.
       // TODO: Try to take into account element type bit width to get
       // 4xdword reads instead of 4x{elements}.
-      if (vectorizable && wgDim == 0 && !lossFactor && candidate % 4 == 0) {
+      if (vectorizable && wgDim == 0 && !lossFactor &&
+          candidate % numVectorElements == 0) {
         // Use size-1 vectors to increase parallelism if larger ones cause
        // idle threads in the subgroup.
         bool hasIdleThreads =
             partitionableLoops.size() == 1 && candidate <= subgroupSize;
-        int vectorSize = hasIdleThreads ? 1 : 4;
+        int vectorSize = hasIdleThreads ? 1 : numVectorElements;
         LLVM_DEBUG(llvm::dbgs() << "Use vector size: " << vectorSize << "\n");
         threadTileSizes[shapeDim] = vectorSize * scaleToByte;
         candidateWorkgroupSize = candidate / vectorSize;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index c769f0424860..6ba78769d765 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -219,6 +219,25 @@ module {
 
 // -----
 
+module {
+  func.func @elementwise_dynamic_dim_large(%11: tensor, %12: tensor) -> tensor {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %8 = tensor.dim %11, %c0 : tensor
+    %13 = tensor.empty(%8) : tensor
+    %15 = linalg.add ins(%11, %12 : tensor, tensor) outs(%13 : tensor) -> tensor
+    return %15 : tensor
+  }
+}
+
+// CHECK-LABEL: func.func @elementwise_dynamic_dim_large
+// CHECK-SAME: #iree_codegen.translation_info
+// CHECK: linalg.add {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: thread = [1, 8]
+// CHECK-SAME: workgroup = [1, 512]
+
+// -----
+
 module @elementwise_unaligned {
   func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
     %cst = arith.constant 0.000000e+00 : f32

From 792df23bbc8d52788599a87c80b3d74fd64e0510 Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Thu, 13 Feb 2025 20:15:52 -0600
Subject: [PATCH 2/4] fix test and add erf test

Signed-off-by: Nirvedh
---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 10 +--
 .../LLVMGPU/test/rocdl_pipeline_test.mlir     | 73 ++++++++++++++++++-
 2 files changed, 74 insertions(+), 9 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 1205942dd9a2..1ac741871780 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -648,20 +648,20 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
       // Ensure vectorization works with the `workgroupTileMultiple`.
       int64_t workgroupTileMultiple = workgroupTileSizeMultiples[shapeDim];
       unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
-      int64_t vecorizableCandidate = numVectorElements * numThreads;
+      int64_t vectorizableCandidate = numVectorElements * numThreads;
       // For smaller shapes, we reduce `numVectorElements` as we may not find
       // work for all threads otherwise and we don't have vectorization enabled
       // with loss.
-      while (vectorizable && (vecorizableCandidate > loopBound) &&
+      while (vectorizable && (vectorizableCandidate > loopBound) &&
              numVectorElements > 4) {
         numVectorElements /= 2;
-        vecorizableCandidate = numVectorElements * numThreads;
+        vectorizableCandidate = numVectorElements * numThreads;
       }
       vectorizable =
-          vectorizable && vecorizableCandidate % workgroupTileMultiple == 0;
+          vectorizable && vectorizableCandidate % workgroupTileMultiple == 0;
 
       if (vectorizable && wgDim == 0 && !lossFactor) {
-        candidates.push_back(vecorizableCandidate);
+        candidates.push_back(vectorizableCandidate);
       }
 
       // Try all power of two multiples of `workgroupTileMultiple` up to the
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index 941bcf60989c..ad34cd771c4b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -137,10 +137,10 @@ hal.executable @ext_fp8_dispatch {
 
 // CDNA3-LABEL: hal.executable public @ext_fp8_dispatch
 // CDNA3: hal.executable.variant public @rocm
-// CDNA3-COUNT-4: rocdl.cvt.f32.fp8 %{{.*}} : f32
-// CDNA3-COUNT-4: rocdl.cvt.f32.bf8 %{{.*}} : f32
-// CDNA3: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<4xf32>
-// CDNA3: llvm.store %[[ADD]], %{{.*}} : vector<4xf32>, !llvm.ptr<1>
+// CDNA3-COUNT-16: rocdl.cvt.f32.fp8 %{{.*}} : f32
+// CDNA3-COUNT-16: rocdl.cvt.f32.bf8 %{{.*}} : f32
+// CDNA3: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<16xf32>
+// CDNA3: llvm.store %[[ADD]], %{{.*}} : vector<16xf32>, !llvm.ptr<1>
 
 // -----
 
@@ -189,3 +189,68 @@ hal.executable @ceildiv_expand_dispatch {
 // CDNA3-COUNT-2: llvm.and {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.or {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.select {{.*}} : vector<1xi1>, vector<1xi32>
+
+// -----
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @erf_dispatch {
+  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export @erf layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
+      hal.return %x, %y, %z : index, index, index
+    }
+builtin.module {
+  func.func @erf() {
+    %cst = arith.constant 1.270000e+02 : f16
+    %cst_0 = arith.constant -1.280000e+02 : f16
+    %cst_1 = arith.constant 5.000000e-01 : f16
+    %cst_2 = arith.constant 1.000000e+00 : f16
+    %cst_3 = arith.constant 2.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>>
+    %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<readonly:tensor<5120xf16>>
+    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
+    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
+    %9 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16>
+    %10 = flow.dispatch.tensor.load %7, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+    %11 = tensor.empty() : tensor<2x1024x5120xi8>
+    %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
+    %13 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
+    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
+    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
+      %15 = math.sqrt %cst_3 : f16
+      %16 = arith.divf %in_4, %15 : f16
+      %17 = math.erf %16 : f16
+      %18 = arith.addf %17, %cst_2 : f16
+      %19 = arith.mulf %18, %cst_1 : f16
+      %20 = arith.mulf %in_4, %19 : f16
+      %21 = arith.mulf %in, %20 : f16
+      %22 = arith.mulf %21, %in_5 : f16
+      %23 = arith.truncf %in_6 : f32 to f16
+      %24 = arith.divf %22, %23 : f16
+      %25 = math.roundeven %24 : f16
+      %26 = arith.cmpf ult, %25, %cst_0 : f16
+      %27 = arith.select %26, %cst_0, %25 : f16
+      %28 = arith.cmpf ugt, %27, %cst : f16
+      %29 = arith.select %28, %cst, %27 : f16
+      %30 = arith.fptosi %29 : f16 to i8
+      linalg.yield %30 : i8
+    } -> tensor<2x1024x5120xi8>
+    flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
+    return
+  }
+}
+  }
+}
+
+// CDNA3-LABEL: hal.executable public @erf_dispatch
+// CDNA3: hal.executable.variant public @rocm
+// CDNA3-COUNT-19: llvm.select {{.*}} : vector<8xi1>, vector<8xf32>
+// CDNA3-COUNT-8: llvm.intr.fma{{.*}} : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+// CDNA3: %[[RESULT:.+]] = llvm.fptosi {{.*}} : vector<8xf16> to vector<8xi8>
+// CDNA3: llvm.store %[[RESULT]], %{{.*}} : vector<8xi8>, !llvm.ptr<1>

From 6485474f9e9ffe63a8732838050014e7c4e91c99 Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Thu, 13 Feb 2025 21:14:02 -0600
Subject: [PATCH 3/4] add config test for erf

Signed-off-by: Nirvedh
---
 .../test/ROCDL/config_tile_and_fuse.mlir      | 41 ++++++------
 .../LLVMGPU/test/rocdl_pipeline_test.mlir     | 65 -------------------
 2 files changed, 22 insertions(+), 84 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 6ba78769d765..5901999d5adb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -219,25 +219,6 @@ module {
 
 // -----
 
-module {
-  func.func @elementwise_dynamic_dim_large(%11: tensor, %12: tensor) -> tensor {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %8 = tensor.dim %11, %c0 : tensor
-    %13 = tensor.empty(%8) : tensor
-    %15 = linalg.add ins(%11, %12 : tensor, tensor) outs(%13 : tensor) -> tensor
-    return %15 : tensor
-  }
-}
-
-// CHECK-LABEL: func.func @elementwise_dynamic_dim_large
-// CHECK-SAME: #iree_codegen.translation_info
-// CHECK: linalg.add {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: thread = [1, 8]
-// CHECK-SAME: workgroup = [1, 512]
-
-// -----
-
 module @elementwise_unaligned {
   func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
     %cst = arith.constant 0.000000e+00 : f32
@@ -673,3 +654,25 @@ func.func @pack_dynamic_tile(%arg0: tensor<32x32xi8>, %d0: index, %d1: index, %t
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: thread = [1, 4]
 // CHECK-SAME: workgroup = [8, 32]
+
+// -----
+
+module {
+  func.func @erf(%13 : tensor<2x1024x5120xf16>, %12 : tensor<2x1024x5120xf16>, %9 : tensor<5120xf16>, %10 : tensor<f32>) -> tensor<2x1024x5120xi8> {
+    %cst = arith.constant 0.000000e+00 : f16
+    %11 = tensor.empty() : tensor<2x1024x5120xi8>
+    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
+    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
+      %17 = math.erf %in : f16
+      %30 = arith.fptosi %17 : f16 to i8
+      linalg.yield %30 : i8
+    } -> tensor<2x1024x5120xi8>
+    return %14 : tensor<2x1024x5120xi8>
+  }
+}
+
+// CHECK-LABEL: func.func @erf
+// CHECK-SAME: #iree_codegen.translation_info
+// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: thread = [1, 1, 8]
+// CHECK-SAME: workgroup = [1, 1, 512]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index ad34cd771c4b..b4f8d8259925 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -189,68 +189,3 @@ hal.executable @ceildiv_expand_dispatch {
 // CDNA3-COUNT-2: llvm.and {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.or {{.*}} : vector<1xi1>
 // CDNA3-COUNT-1: llvm.select {{.*}} : vector<1xi1>, vector<1xi32>
-
-// -----
-#pipeline_layout = #hal.pipeline.layout<bindings = [
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-hal.executable @erf_dispatch {
-  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export @erf layout(#pipeline_layout) {
-    ^bb0(%arg0: !hal.device, %arg1: index):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
-      hal.return %x, %y, %z : index, index, index
-    }
-builtin.module {
-  func.func @erf() {
-    %cst = arith.constant 1.270000e+02 : f16
-    %cst_0 = arith.constant -1.280000e+02 : f16
-    %cst_1 = arith.constant 5.000000e-01 : f16
-    %cst_2 = arith.constant 1.000000e+00 : f16
-    %cst_3 = arith.constant 2.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>>
-    %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<readonly:tensor<5120xf16>>
-    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
-    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
-    %9 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16>
-    %10 = flow.dispatch.tensor.load %7, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %11 = tensor.empty() : tensor<2x1024x5120xi8>
-    %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
-    %13 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
-    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
-    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
-      %15 = math.sqrt %cst_3 : f16
-      %16 = arith.divf %in_4, %15 : f16
-      %17 = math.erf %16 : f16
-      %18 = arith.addf %17, %cst_2 : f16
-      %19 = arith.mulf %18, %cst_1 : f16
-      %20 = arith.mulf %in_4, %19 : f16
-      %21 = arith.mulf %in, %20 : f16
-      %22 = arith.mulf %21, %in_5 : f16
-      %23 = arith.truncf %in_6 : f32 to f16
-      %24 = arith.divf %22, %23 : f16
-      %25 = math.roundeven %24 : f16
-      %26 = arith.cmpf ult, %25, %cst_0 : f16
-      %27 = arith.select %26, %cst_0, %25 : f16
-      %28 = arith.cmpf ugt, %27, %cst : f16
-      %29 = arith.select %28, %cst, %27 : f16
-      %30 = arith.fptosi %29 : f16 to i8
-      linalg.yield %30 : i8
-    } -> tensor<2x1024x5120xi8>
-    flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
-    return
-  }
-}
-  }
-}
-
-// CDNA3-LABEL: hal.executable public @erf_dispatch
-// CDNA3: hal.executable.variant public @rocm
-// CDNA3-COUNT-19: llvm.select {{.*}} : vector<8xi1>, vector<8xf32>
-// CDNA3-COUNT-8: llvm.intr.fma{{.*}} : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-// CDNA3: %[[RESULT:.+]] = llvm.fptosi {{.*}} : vector<8xf16> to vector<8xi8>
-// CDNA3: llvm.store %[[RESULT]], %{{.*}} : vector<8xi8>, !llvm.ptr<1>

From c26d3dae86eb9669947ec1c19e0a7c583986648e Mon Sep 17 00:00:00 2001
From: Nirvedh
Date: Fri, 14 Feb 2025 11:07:24 -0600
Subject: [PATCH 4/4] fail for complex types

Signed-off-by: Nirvedh
---
 .../compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 1ac741871780..98571af9262b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -511,6 +511,9 @@ static unsigned getRepresentativeBitWidth(linalg::LinalgOp linalgOp) {
   std::optional<unsigned> maxBitWidth;
   auto updateElementTypeBitWidth = [&](Value v) {
     auto elementType = getElementTypeOrSelf(v);
+    if (!elementType.isIntOrFloat()) {
+      return;
+    }
     unsigned bitWidth = elementType.getIntOrFloatBitWidth();
     if (maxBitWidth) {
      maxBitWidth = std::max(maxBitWidth.value(), bitWidth);
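
Illustration (not part of the patches): the series replaces the old fixed
4-elements-per-thread rule with a width derived from a representative element
bitwidth, targeting roughly 128 bits (one dword4) of data per lane per access.
The standalone C++ sketch below mirrors that logic from ConfigUtils.cpp;
`pickNumVectorElements` is a hypothetical name, and the `loopBound` and
`numThreads` sample values are invented for the example, not taken from the
compiler.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Aim for ~128 bits of data per thread, never below 4 elements, and halve
    // for small shapes so every thread still receives work (vectorization
    // with loss is not enabled).
    static unsigned pickNumVectorElements(unsigned representativeBitWidth,
                                          int64_t loopBound, int64_t numThreads,
                                          bool vectorizable) {
      unsigned numVectorElements = std::max(4u, 128 / representativeBitWidth);
      int64_t candidate = numVectorElements * numThreads;
      while (vectorizable && candidate > loopBound && numVectorElements > 4) {
        numVectorElements /= 2;
        candidate = numVectorElements * numThreads;
      }
      return numVectorElements;
    }

    int main() {
      // f32 -> 4, f16 -> 8, fp8/i8 -> 16 elements per thread.
      std::printf("%u %u %u\n",
                  pickNumVectorElements(32, 1 << 20, 64, true),
                  pickNumVectorElements(16, 1 << 20, 64, true),
                  pickNumVectorElements(8, 1 << 20, 64, true));
      // Small loop bound: the f16 candidate 8 * 64 = 512 exceeds 256, so the
      // width is halved until 4 * 64 = 256 fits.
      std::printf("%u\n", pickNumVectorElements(16, 256, 64, true));
      return 0;
    }

This is also why the erf pipeline test expects vector<8xf16>-based codegen
(a 16-bit representative width gives 8 elements per thread) while the ext_fp8
checks move from vector<4xf32> to 16 converted elements per thread.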