diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 4ec0fddb8809..67dec1e91978 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -62,15 +62,15 @@ static uint32_t getArchID(MMAIntrinsic intrinsic) {
   return static_cast<uint32_t>(intrinsic) & 0xFF00;
 }
 
-static bool is_AMD_MFMA(MMAIntrinsic intrinsic) {
+bool is_AMD_MFMA(MMAIntrinsic intrinsic) {
   return getArchID(intrinsic) >= 0x1000 && getArchID(intrinsic) <= 0x17FF;
 }
 
-static bool is_AMD_WMMA(MMAIntrinsic intrinsic) {
+bool is_AMD_WMMA(MMAIntrinsic intrinsic) {
   return getArchID(intrinsic) >= 0x1800 && getArchID(intrinsic) <= 0x1FFF;
 }
 
-static bool is_AMD(MMAIntrinsic intrinsic) {
+bool is_AMD(MMAIntrinsic intrinsic) {
   return is_AMD_MFMA(intrinsic) || is_AMD_WMMA(intrinsic);
 }
 
@@ -296,6 +296,21 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
   return {};
 }
 
+MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
+                                                MMAFragment fragment,
+                                                bool colMajor) {
+  MMASingleSubgroupLayout baseLayout =
+      getSingleSubgroupLayout(intrinsic, fragment);
+  assert(baseLayout.element.size() == 2 && "expected 2d layout");
+  if (colMajor) {
+    std::swap(baseLayout.element[0], baseLayout.element[1]);
+    std::swap(baseLayout.thread[0], baseLayout.thread[1]);
+    std::swap(baseLayout.outer[0], baseLayout.outer[1]);
+    std::swap(baseLayout.tstrides[0], baseLayout.tstrides[1]);
+  }
+  return baseLayout;
+}
+
 // Struct describing the shape of a MMA operation, but not the detailed layout.
 struct OpaqueMmaLayout {
   int64_t mSize = 0;
@@ -339,7 +354,11 @@ static OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context,
 MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
                                                 MMAFragment fragment) {
   if (auto mmaAttr = dyn_cast<MMAAttr>(mmaKind)) {
-    return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment);
+    // |colMajor| indicates that the accumulator layout should be returned
+    // column major.
+    return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment,
+                                   fragment == MMAFragment::Acc &&
+                                       mmaAttr.getColMajor());
   }
   if (auto vmmaAttr = dyn_cast<VirtualMMAAttr>(mmaKind)) {
     return getSingleSubgroupLayout(vmmaAttr.getIntrinsic(), fragment);
@@ -352,6 +371,10 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
 // MMA Attributes
 //===----------------------------------------------------------------------===//
 
+MMAAttr MMAAttr::get(MLIRContext *context, MMAIntrinsic type) {
+  return Base::get(context, type, /*colMajor=*/false);
+}
+
 std::tuple<Type, Type, Type> MMAAttr::getABCElementTypes() const {
   return IREE::GPU::getABCElementTypes(getContext(), getIntrinsic());
 }
@@ -419,7 +442,7 @@ SmallVector<VirtualMMAIntrinsic> MMAAttr::getVirtualIntrinsics() const {
 
 static Value createMmaOp(OpBuilder &builder, Location loc,
                          MMAIntrinsic intrinsic, Type resultType, Value lhs,
-                         Value rhs, Value acc) {
+                         Value rhs, Value acc, bool colMajor = false) {
   auto getVecOrSingleElem = [&](Value vec) -> Value {
     bool one = llvm::cast<VectorType>(vec.getType()).getNumElements() == 1;
    return one ? builder.create<vector::ExtractOp>(loc, vec, 0) : vec;
   };
@@ -429,6 +452,13 @@ static Value createMmaOp(OpBuilder &builder, Location loc,
     // MFMA intrinsics want single-element operands of element type, not vector.
     lhs = getVecOrSingleElem(lhs);
     rhs = getVecOrSingleElem(rhs);
+
+    // Because the thread layouts of the lhs and rhs are transpositions of one
+    // another for all MFMA variants, we can produce a column-major result by
+    // simply swapping the operands to the MFMA.
+    if (colMajor) {
+      std::swap(lhs, rhs);
+    }
     return builder
         .create<amdgpu::MFMAOp>(loc, resultType, layout.mSize, layout.nSize,
                                 layout.kSize, getBlockSize(intrinsic), lhs, rhs,
@@ -458,7 +488,7 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
     return failure();
   }
   if (Value value = createMmaOp(builder, loc, getIntrinsic(), resultType, lhs,
-                                rhs, acc)) {
+                                rhs, acc, getColMajor())) {
     return value;
   }
   return failure();
@@ -543,8 +573,8 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
     SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
     SmallVector<OpFoldResult> &strides) const {
-  MMASingleSubgroupLayout subgroupLayout =
-      getSingleSubgroupLayout(getIntrinsic(), fragment);
+  MMASingleSubgroupLayout subgroupLayout = getSingleSubgroupLayout(
+      getIntrinsic(), fragment, fragment == MMAFragment::Acc && getColMajor());
   SmallVector<OpFoldResult> canonicalOffsets;
   SmallVector<OpFoldResult> canonicalSizes;
   if (failed(populateCanonicalOffsetsSizesAndStrides(
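A quick sanity check on the pieces above: the colMajor overload of getSingleSubgroupLayout derives the column-major fragment purely by swapping the two dimensions of each layout field, so the row-major and column-major layouts of a fragment are mutual transposes. A minimal sketch under that assumption (`intrinsic` stands in for any MFMA value; this snippet is illustrative, not part of the patch):

    MMASingleSubgroupLayout rowMajor =
        getSingleSubgroupLayout(intrinsic, MMAFragment::Acc, /*colMajor=*/false);
    MMASingleSubgroupLayout colMajor =
        getSingleSubgroupLayout(intrinsic, MMAFragment::Acc, /*colMajor=*/true);
    // Every field swaps pairwise, e.g. the lane grid transposes.
    assert(rowMajor.thread[0] == colMajor.thread[1] &&
           rowMajor.thread[1] == colMajor.thread[0]);
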
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
index c5f23adb398d..144bfee21910 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
@@ -63,9 +63,17 @@ int64_t getMSize(MMAIntrinsic intrinsic);
 int64_t getNSize(MMAIntrinsic intrinsic);
 int64_t getKSize(MMAIntrinsic intrinsic);
 
+bool is_AMD_MFMA(MMAIntrinsic intrinsic);
+bool is_AMD_WMMA(MMAIntrinsic intrinsic);
+bool is_AMD(MMAIntrinsic intrinsic);
+
 MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
                                                 MMAFragment fragment);
 
+MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
+                                                MMAFragment fragment,
+                                                bool colMajor);
+
 MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic,
                                                 MMAFragment fragment);
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index ece2c50955e2..9b5487fc89cb 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -153,10 +153,15 @@ def IREEGPU_MMAAttr : AttrDef<IREEGPU_Dialect, "MMA"
   let parameters = (ins
-    EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic
+    EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic,
+    DefaultValuedParameter<"bool", "false">:$col_major
   );
 
-  let assemblyFormat = "`<` params `>`";
+  let assemblyFormat = "`<` $intrinsic (`,` `col_major` `=` $col_major^)? `>`";
+
+  let builders = [
+    AttrBuilder<(ins "MMAIntrinsic":$intrinsic)>
+  ];
 
   let extraClassDeclaration = [{
     int64_t getBlockSize() const;
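The explicit AttrBuilder mirrors the two-argument MMAAttr::get defined in IREEGPUAttrs.cpp above, so existing single-argument call sites keep compiling while the generated storage gains the extra col_major parameter. A sketch of both construction paths (assuming an in-scope MLIRContext *ctx; the enum value is illustrative):

    // Row-major accumulator: identical to the old single-parameter behavior.
    auto rowMajor = MMAAttr::get(ctx, MMAIntrinsic::MFMA_F32_16x16x16_F16);
    // Column-major accumulator, as ConfigUtils.cpp selects further below.
    auto colMajor = MMAAttr::get(ctx, MMAIntrinsic::MFMA_F32_16x16x16_F16,
                                 /*colMajor=*/true);
    assert(!rowMajor.getColMajor() && colMajor.getColMajor());
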
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
index d506140ea27a..8013b72c1f62 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
@@ -18,6 +18,15 @@ module {
 // CHECK-LABEL: func @test_mfma_f16_32x32x8_f32
 // CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>
 
+module {
+  func.func @test_col_major_mfma_f16_16x16x16_f32() attributes {
+      mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>} {
+    return
+  }
+}
+// CHECK-LABEL: func @test_col_major_mfma_f16_16x16x16_f32
+// CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
+
 module {
   func.func @test_wmma_f16_16x16x16_f32() attributes {
       mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>} {
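Note on the assembly format this test exercises: because $col_major is a DefaultValuedParameter inside an optional group, a row-major attribute still prints exactly as before and only the non-default value emits the keyword, which is why the pre-existing tests in this file are unchanged. Expected round-trip behavior (assuming standard MLIR elision of default-valued parameters):

    // MMAAttr::get(ctx, intrinsic)
    //   -> #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
    // MMAAttr::get(ctx, intrinsic, /*colMajor=*/true)
    //   -> #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
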
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 48bfcc9a7c2a..681cc809e6a4 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -14,6 +14,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
@@ -172,12 +173,40 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
   return schedule;
 }
 
+/// A coarse approximation of when the given value |v| is consumed column
+/// major.
+static bool coarseIsResultColumnMajor(Value v, int64_t dim0, int64_t dim1) {
+  if (!v.hasOneUse()) {
+    return false;
+  }
+
+  auto consumer = dyn_cast<linalg::LinalgOp>(*v.user_begin());
+  if (!consumer) {
+    return false;
+  }
+
+  OpOperand &operand = *v.use_begin();
+  AffineMap indexingMap = consumer.getMatchingIndexingMap(&operand);
+
+  SmallVector<unsigned> permutedDims;
+  auto d0 = dyn_cast<AffineDimExpr>(indexingMap.getResult(dim0));
+  auto d1 = dyn_cast<AffineDimExpr>(indexingMap.getResult(dim1));
+
+  // If dim0 (the outer dim) has a smaller position than dim1, then assume the
+  // consumer is not using |v| transposed.
+  if (!d0 || !d1 || d0.getPosition() < d1.getPosition()) {
+    return false;
+  }
+  return true;
+}
+
 /// Create a matmul lowering config based on iteration bounds and indexing
 /// maps for a given target. This function computes contraction dimensions
 /// and deduces an MMA intrinsic schedule to choose tile sizes and the
 /// workgroup size.
 static FailureOr<std::pair<LoweringConfigAttr, int64_t>>
-getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
+getMatmulLoweringConfigAndWorkgroupSize(Value result,
+                                        SmallVector<int64_t> bounds,
                                         ArrayRef<AffineMap> maps,
                                         ArrayRef<Value> operands,
                                         IREE::GPU::TargetAttr target) {
@@ -327,6 +356,22 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   IREE::GPU::MmaInterfaceAttr mmaKind =
       target.getWgp().getMma()[schedule->index];
 
+  if (auto mma = dyn_cast<MMAAttr>(mmaKind)) {
+    bool preferColumnMajor =
+        coarseIsResultColumnMajor(result, mDims.back(), nDims.back());
+
+    // Note that "column major" is overloaded here. |preferColumnMajor| is in
+    // reference to the computation itself, while |colMajor| on MMAAttr refers
+    // to whether the result of the MMA instruction should be column major.
+    // MFMA only vectorizes along columns, so we want to pick the *opposite*
+    // of whatever the computation prefers (e.g. row-major compute => do the
+    // MFMA column major).
+    if (IREE::GPU::is_AMD_MFMA(mma.getIntrinsic()) && !preferColumnMajor) {
+      mmaKind =
+          MMAAttr::get(mma.getContext(), mma.getIntrinsic(), /*colMajor=*/true);
+    }
+  }
+
   // Attach the MMA schedule as an attribute to the entry point export function
   // for later access in the pipeline.
   MLIRContext *context = lhs.getContext();
@@ -374,7 +419,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
                                   mlir::FunctionOpInterface entryPoint,
                                   Operation *op) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-  if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
+  if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp) ||
+      !linalgOp.hasPureTensorSemantics()) {
     return failure();
   }
 
@@ -396,7 +442,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> bounds = igemmLoopBounds;
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
-      getMatmulLoweringConfigAndWorkgroupSize(bounds, igemmContractionMaps,
+      getMatmulLoweringConfigAndWorkgroupSize(linalgOp->getResult(0), bounds,
+                                              igemmContractionMaps,
                                               igemmOperands, target);
   if (failed(configAndWgSize)) {
     return failure();
   }
@@ -429,7 +476,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
                                       Operation *op) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-  if (!linalgOp || !linalg::isaContractionOpInterface(linalgOp)) {
+  if (!linalgOp || !linalg::isaContractionOpInterface(linalgOp) ||
+      !linalgOp.hasPureTensorSemantics()) {
     return failure();
   }
 
@@ -440,7 +488,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   LDBG("Matmul TileAndFuse Config");
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
-      getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target);
+      getMatmulLoweringConfigAndWorkgroupSize(linalgOp->getResult(0), bounds,
+                                              maps, operands, target);
   if (failed(configAndWgSize)) {
     return failure();
   }
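A worked example of the heuristic, on hypothetical IR rather than anything in this patch: suppose |v| has a single linalg consumer whose indexing map for the operand reading |v| is affine_map<(d0, d1) -> (d1, d0)>, with dim0 = 0 (the m dimension) and dim1 = 1 (the n dimension):

    // indexingMap.getResult(/*dim0=*/0) == d1   // getPosition() == 1
    // indexingMap.getResult(/*dim1=*/1) == d0   // getPosition() == 0
    // 1 < 0 is false, so the consumer reads |v| transposed and the function
    // returns true. For the identity map (d0, d1) -> (d0, d1) it returns
    // false, as it does for multi-use values and non-linalg consumers.
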
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
index 893994160471..9cc04926932e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
@@ -68,6 +68,40 @@ module attributes { transform.with_named_sequence } {
 
 // -----
 
+#contraction_accesses = [
+ affine_map<() -> ()>,
+ affine_map<() -> ()>,
+ affine_map<() -> ()>
+]
+func.func @lower_col_major_multi_mma_mfma_32x32x8(%lhs: vector<4xf16>, %rhs: vector<4xf16>, %acc: vector<16xf32>) -> vector<16xf32> {
+  %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [],
+    kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
+  } : vector<4xf16>, vector<4xf16> into vector<16xf32>
+  return %0 : vector<16xf32>
+}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.lower_multi_mma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL: func @lower_col_major_multi_mma_mfma_32x32x8
+// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
+// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
+// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
+// CHECK: amdgpu.mfma %[[RHS]] * %[[LHS]] + %[[ACC]]
+// CHECK-SAME: blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
+// CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>
+
+// -----
+
 #contraction_accesses = [
  affine_map<() -> ()>,
  affine_map<() -> ()>,
 affine_map<() -> ()>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
index d8af13ab5916..ff2109c3e647 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir
@@ -107,6 +107,45 @@ module {
 
 // -----
 
+#contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+module {
+  func.func @col_major_matmul_32x32x8(%arg0: tensor<2x8x32x8xf16>, %arg1: tensor<8x2x32x8xf16>, %arg2: tensor<2x2x32x4x8xf32>) -> tensor<2x2x32x4x8xf32> {
+    %mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 {
+      indexing_maps = #contraction_accesses,
+      iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
+      kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>,
+      rhs_permutation = array<i64: 1, 0>
+    } : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16> into tensor<2x2x32x4x8xf32>
+    return %mm : tensor<2x2x32x4x8xf32>
+  }
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL: func @col_major_matmul_32x32x8
+// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16>
+// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16>
+// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x32x4x8xf32>)
+// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32)
+// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4)
+// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[IDY]]] [2, 8, 1, 4]
+// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[IDY]]] [8, 2, 1, 4]
+// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
+// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
+// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME: kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
+// CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x1x4x4xf32>
+// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
+// CHECK: mapping = [#iree_gpu.lane_id<0>]
+
+// -----
+
 #contraction_accesses = [
  affine_map<(i, j, k) -> (i, k)>,
  affine_map<(i, j, k) -> (k, j)>,
 affine_map<(i, j, k) -> (i, j)>
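Why checking `amdgpu.mfma %[[RHS]] * %[[LHS]]` is enough in the lowering test: swapping the operands is the entire lowering of the col_major bit. The identity behind it, restating the createMmaOp comment:

    // C^T = (A * B)^T = B^T * A^T
    // Since the MFMA A and B fragments hold mutually transposed thread
    // layouts, mfma(rhs, lhs) leaves C^T, i.e. a column-major C, in the
    // accumulator registers at no extra cost.

In the distribution test, only the accumulator slicing differs from the row-major variant, because populateOperandOffsetsSizesStrides swaps the layout only for fragment == Acc && getColMajor(); the Lhs and Rhs slices are untouched.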
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
index cf170ef7d930..c73960a07e55 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
@@ -22,7 +22,7 @@ func.func @nhwc_conv_mfma() {
 // CHECK-SAME: use_igemm_convolution = true
 // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
@@ -51,7 +51,7 @@ func.func @nchw_conv_mfma() {
 // CHECK-SAME: use_igemm_convolution = true
 // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
@@ -80,7 +80,7 @@ func.func @nhwc_conv_unaligned_mfma() {
 // CHECK-SAME: use_igemm_convolution = true
 // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
 // CHECK-SAME: padding = [2, 1, 32, 64, 32]
 // CHECK-SAME: promote_operands = [0, 1, 2]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
@@ -110,7 +110,7 @@ func.func @nchw_conv_unaligned_mfma() {
 // CHECK-SAME: use_igemm_convolution = true
 // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
 // CHECK-SAME: padding = [1, 64, 2, 32, 32]
 // CHECK-SAME: promote_operands = [0, 1, 2]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
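None of these convolution results is consumed transposed, so coarseIsResultColumnMajor is false and the new MFMA branch flips the accumulator layout in every case. A compact restatement of the selection added in ConfigUtils.cpp:

    // is_AMD_MFMA(intrinsic) && !preferColumnMajor -> rebuild the MMAAttr with
    //                                                 col_major = true (these tests)
    // is_AMD_MFMA(intrinsic) &&  preferColumnMajor -> keep the row-major attr
    // WMMA / virtual / non-MMAAttr kinds           -> unchanged
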
// CHECK: linalg.fill ins +// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout +// CHECK-SAME: promote_operands = [0, 1] +// CHECK-SAME: reduction = [0, 0, 2] +// CHECK-SAME: subgroup = [4, 4, 0] +// CHECK-SAME: workgroup = [128, 128, 0] + +// LATE: LLVMGPUVectorDistribute + +// ----- + +func.func @transposed_mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %5 = tensor.empty() : tensor<1024x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + %7 = linalg.matmul ins(%lhs, %rhs : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + %8 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%7 : tensor<1024x1024xf32>) outs(%5 : tensor<1024x1024xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1024x1024xf32> + return %8 : tensor<1024x1024xf32> +} + +// CHECK-LABEL: func.func @transposed_mfma_matmul_1024x1024x1024 +// CHECK-SAME: #iree_codegen.translation_info + +// Verify that we do not choose to use a column major layout. // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: promote_operands = [0, 1] @@ -321,7 +353,7 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg // LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim // LATE-SAME: {translation_info = #iree_codegen.translation_info +// LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout // LATE-SAME: padding = [16, 1, 16, 4] // LATE-SAME: promote_operands = [0, 1, 2] // LATE-SAME: reduction = [0, 1, 0, 1],