[LLVMGPU] Prefer column major MFMA to vectorize stores #19919

Open. Wants to merge 2 commits into main.

Changes from all commits:
@@ -62,15 +62,15 @@ static uint32_t getArchID(MMAIntrinsic intrinsic) {
return static_cast<int>(intrinsic) & 0xFF00;
}

-static bool is_AMD_MFMA(MMAIntrinsic intrinsic) {
+bool is_AMD_MFMA(MMAIntrinsic intrinsic) {
return getArchID(intrinsic) >= 0x1000 && getArchID(intrinsic) <= 0x17FF;
}

-static bool is_AMD_WMMA(MMAIntrinsic intrinsic) {
+bool is_AMD_WMMA(MMAIntrinsic intrinsic) {
return getArchID(intrinsic) >= 0x1800 && getArchID(intrinsic) <= 0x1FFF;
}

-static bool is_AMD(MMAIntrinsic intrinsic) {
+bool is_AMD(MMAIntrinsic intrinsic) {
return is_AMD_MFMA(intrinsic) || is_AMD_WMMA(intrinsic);
}

@@ -296,6 +296,21 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
return {};
}

MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
MMAFragment fragment,
bool colMajor) {
MMASingleSubgroupLayout baseLayout =
getSingleSubgroupLayout(intrinsic, fragment);
assert(baseLayout.element.size() == 2 && "expected 2d layout");
if (colMajor) {
std::swap(baseLayout.element[0], baseLayout.element[1]);
std::swap(baseLayout.thread[0], baseLayout.thread[1]);
std::swap(baseLayout.outer[0], baseLayout.outer[1]);
std::swap(baseLayout.tstrides[0], baseLayout.tstrides[1]);
}
return baseLayout;
}
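
To make the swap concrete, here is a small standalone sketch (not IREE code): the struct and the layout numbers below are hypothetical stand-ins for `MMASingleSubgroupLayout` and an accumulator layout, chosen only to show that the `colMajor` overload transposes every per-dimension field.

```cpp
// Hypothetical stand-in for MMASingleSubgroupLayout; the values are made up.
#include <array>
#include <cstdio>
#include <utility>

struct ToyLayout {
  std::array<int, 2> outer, thread, tstrides, element;
};

// Mirrors the new overload: transpose each field when colMajor is requested.
static ToyLayout getToyLayout(bool colMajor) {
  ToyLayout l = {/*outer=*/{1, 1}, /*thread=*/{4, 16},
                 /*tstrides=*/{16, 1}, /*element=*/{4, 1}};
  if (colMajor) {
    std::swap(l.element[0], l.element[1]);
    std::swap(l.thread[0], l.thread[1]);
    std::swap(l.outer[0], l.outer[1]);
    std::swap(l.tstrides[0], l.tstrides[1]);
  }
  return l;
}

int main() {
  ToyLayout cm = getToyLayout(/*colMajor=*/true);
  // Threads that strode along dim 1 now stride along dim 0, and each thread's
  // contiguous elements move to dim 1, i.e. the layout is read transposed.
  std::printf("thread = [%d, %d], element = [%d, %d]\n", cm.thread[0],
              cm.thread[1], cm.element[0], cm.element[1]);
  return 0;
}
```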

// Struct describing the shape of a MMA operation, but not the detailed layout.
struct OpaqueMmaLayout {
int64_t mSize = 0;
@@ -339,7 +354,11 @@ static OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context,
MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
MMAFragment fragment) {
if (auto mmaAttr = dyn_cast<MMAAttr>(mmaKind)) {
-return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment);
+// |colMajor| indicates that the accumulator layout should be returned
+// column major.
+return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment,
+fragment == MMAFragment::Acc &&
+mmaAttr.getColMajor());
}
if (auto vmmaAttr = dyn_cast<VirtualMMAAttr>(mmaKind)) {
return getSingleSubgroupLayout(vmmaAttr.getIntrinsic(), fragment);
@@ -352,6 +371,10 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
// MMA Attributes
//===----------------------------------------------------------------------===//

MMAAttr MMAAttr::get(MLIRContext *context, MMAIntrinsic type) {
return Base::get(context, type, /*colMajor=*/false);
}
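
As a usage sketch only (not part of the diff): after this change both the defaulted builder above and the generated three-parameter getter are available. The header path and the enum value below are taken from elsewhere in this PR; the helper name and the surrounding setup are assumptions.

```cpp
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;
using namespace mlir::iree_compiler::IREE::GPU;

// Hypothetical helper showing both construction paths.
MMAAttr makeAccColMajorMma(MLIRContext *ctx) {
  // New builder: col_major defaults to false (row-major accumulator).
  MMAAttr rowMajor = MMAAttr::get(ctx, MMAIntrinsic::MFMA_F32_16x16x16_F16);
  (void)rowMajor;
  // Generated getter: request a column-major accumulator layout explicitly.
  return MMAAttr::get(ctx, MMAIntrinsic::MFMA_F32_16x16x16_F16,
                      /*colMajor=*/true);
}
```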

std::tuple<Type, Type, Type> MMAAttr::getABCElementTypes() const {
return IREE::GPU::getABCElementTypes(getContext(), getIntrinsic());
}
@@ -419,7 +442,7 @@ SmallVector<VirtualMMAIntrinsic> MMAAttr::getVirtualIntrinsics() const {

static Value createMmaOp(OpBuilder &builder, Location loc,
MMAIntrinsic intrinsic, Type resultType, Value lhs,
-Value rhs, Value acc) {
+Value rhs, Value acc, bool colMajor = false) {
auto getVecOrSingleElem = [&](Value vec) -> Value {
bool one = llvm::cast<VectorType>(vec.getType()).getNumElements() == 1;
return one ? builder.create<vector::ExtractOp>(loc, vec, 0) : vec;
@@ -429,6 +452,13 @@ static Value createMmaOp(OpBuilder &builder, Location loc,
// MFMA intrinsics want single-element operands of element type, not vector.
lhs = getVecOrSingleElem(lhs);
rhs = getVecOrSingleElem(rhs);

// Because the thread layouts of the lhs and rhs are transpositions of one
// another for all MFMA variants, we can produce a column major result by
// simply swapping the operands to the MFMA.
if (colMajor) {
std::swap(lhs, rhs);
}
return builder
.create<amdgpu::MFMAOp>(loc, resultType, layout.mSize, layout.nSize,
layout.kSize, getBlockSize(intrinsic), lhs, rhs,
@@ -458,7 +488,7 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
return failure();
}
if (Value value = createMmaOp(builder, loc, getIntrinsic(), resultType, lhs,
-rhs, acc)) {
+rhs, acc, getColMajor())) {
return value;
}
return failure();
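
The operand swap in createMmaOp leans on the matrix identity (A*B)^T = B^T * A^T; the standalone check below verifies that identity on plain scalar matrices. It says nothing about MFMA lane layouts themselves, which is the extra fact (the A and B thread layouts being transposes of each other) that the comment above supplies.

```cpp
// Standalone check of (A*B)^T == B^T * A^T with plain scalar matrices.
// Nothing here calls amdgpu.mfma or any IREE API.
#include <array>
#include <cassert>

template <int M, int N, int K>
static std::array<std::array<float, N>, M>
matmul(const std::array<std::array<float, K>, M> &a,
       const std::array<std::array<float, N>, K> &b) {
  std::array<std::array<float, N>, M> c{};
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < K; ++k)
        c[i][j] += a[i][k] * b[k][j];
  return c;
}

int main() {
  std::array<std::array<float, 3>, 2> a{{{1, 2, 3}, {4, 5, 6}}};      // 2x3
  std::array<std::array<float, 2>, 3> b{{{7, 8}, {9, 10}, {11, 12}}}; // 3x2
  auto ab = matmul<2, 2, 3>(a, b); // A*B, 2x2, row major
  // Transpose both inputs and swap them: this computes B^T * A^T.
  std::array<std::array<float, 3>, 2> bt{};
  std::array<std::array<float, 2>, 3> at{};
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 2; ++j) {
      bt[j][i] = b[i][j];
      at[i][j] = a[j][i];
    }
  auto btat = matmul<2, 2, 3>(bt, at);
  // Reading B^T * A^T row major is the same as reading A*B column major.
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      assert(btat[i][j] == ab[j][i]);
  return 0;
}
```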
@@ -543,8 +573,8 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
SmallVector<OpFoldResult> &strides) const {

-MMASingleSubgroupLayout subgroupLayout =
-getSingleSubgroupLayout(getIntrinsic(), fragment);
+MMASingleSubgroupLayout subgroupLayout = getSingleSubgroupLayout(
+getIntrinsic(), fragment, fragment == MMAFragment::Acc && getColMajor());
SmallVector<OpFoldResult> canonicalOffsets;
SmallVector<OpFoldResult> canonicalSizes;
if (failed(populateCanonicalOffsetsSizesAndStrides(
@@ -63,9 +63,17 @@ int64_t getMSize(MMAIntrinsic intrinsic);
int64_t getNSize(MMAIntrinsic intrinsic);
int64_t getKSize(MMAIntrinsic intrinsic);

bool is_AMD_MFMA(MMAIntrinsic intrinsic);
bool is_AMD_WMMA(MMAIntrinsic intrinsic);
bool is_AMD(MMAIntrinsic intrinsic);

MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
MMAFragment fragment);

MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
MMAFragment fragment,
bool colMajor);

MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic,
MMAFragment fragment);

@@ -153,10 +153,15 @@ def IREEGPU_MMAAttr : AttrDef<IREEGPU_Dialect, "MMA", [
}];

let parameters = (ins
-EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic
+EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic,
+DefaultValuedParameter<"bool", "false">:$col_major
);

-let assemblyFormat = "`<` params `>`";
+let assemblyFormat = "`<` $intrinsic (`,` `col_major` `=` $col_major^)? `>`";

let builders = [
AttrBuilder<(ins "MMAIntrinsic":$intrinsic)>
];

let extraClassDeclaration = [{
int64_t getBlockSize() const;
@@ -18,6 +18,15 @@ module {
// CHECK-LABEL: func @test_mfma_f16_32x32x8_f32
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>

module {
func.func @test_col_major_mfma_f16_16x16x16_f32() attributes {
mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>} {
return
}
}
// CHECK-LABEL: func @test_col_major_mfma_f16_16x16x16_f32
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>

module {
func.func @test_wmma_f16_16x16x16_f32() attributes {
mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>} {
@@ -14,6 +14,7 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Casting.h"
@@ -172,12 +173,40 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
return schedule;
}

[Reviewer comment, Contributor]: Can this "coarseness" be refined into something more precise in any useful way?

/// A coarse approximation of whether the given value |v| is consumed column
/// major.
bool coarseIsResultColumnMajor(Value v, int64_t dim0, int64_t dim1) {
if (!v.hasOneUse()) {
return false;
}

auto consumer = dyn_cast<linalg::LinalgOp>(*v.user_begin());
if (!consumer) {
return false;
}

OpOperand &operand = *v.use_begin();
AffineMap indexingMap = consumer.getMatchingIndexingMap(&operand);

SmallVector<unsigned int> permutedDims;
auto d0 = dyn_cast<AffineDimExpr>(indexingMap.getResult(dim0));
auto d1 = dyn_cast<AffineDimExpr>(indexingMap.getResult(dim1));

// If dim0 (outer dim) has a smaller position than dim1, then assume the
// consumer is not using |v| transposed.
if (!d0 || !d1 || d0.getPosition() < d1.getPosition()) {
return false;
}
return true;
}
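
To see what the position test keys on, here is a hypothetical, self-contained MLIR snippet (not part of the patch, and without the single-use / LinalgOp walking): an identity consumer map is classified as row-major consumption, while a transposing map is classified as column-major.

```cpp
// Hypothetical illustration of the dim-position test used above.
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Same predicate as the body of coarseIsResultColumnMajor, minus the
// use/def-chain checks: true when result position dim0 maps to a loop
// dimension no earlier than the one result position dim1 maps to.
static bool consumedColumnMajor(AffineMap map, int64_t dim0, int64_t dim1) {
  auto d0 = dyn_cast<AffineDimExpr>(map.getResult(dim0));
  auto d1 = dyn_cast<AffineDimExpr>(map.getResult(dim1));
  return d0 && d1 && d0.getPosition() >= d1.getPosition();
}

int main() {
  MLIRContext ctx;
  AffineExpr d0 = getAffineDimExpr(0, &ctx);
  AffineExpr d1 = getAffineDimExpr(1, &ctx);
  AffineMap identity = AffineMap::get(2, 0, {d0, d1}, &ctx);   // (d0, d1)
  AffineMap transposed = AffineMap::get(2, 0, {d1, d0}, &ctx); // (d1, d0)
  llvm::outs() << "identity:   "
               << (consumedColumnMajor(identity, 0, 1) ? "col-major" : "row-major")
               << "\n";
  llvm::outs() << "transposed: "
               << (consumedColumnMajor(transposed, 0, 1) ? "col-major" : "row-major")
               << "\n";
  return 0;
}
```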

/// Create a matmul lowering config based on iteration bounds and indexing
/// maps for a given target. This function computes contraction dimensions
/// and deduces an MMA intrinsic schedule to choose tile sizes and the
/// workgroup size.
static FailureOr<std::pair<LoweringConfigAttr, int64_t>>
-getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
+getMatmulLoweringConfigAndWorkgroupSize(Value result,
+SmallVector<int64_t> bounds,
ArrayRef<AffineMap> maps,
ArrayRef<Value> operands,
IREE::GPU::TargetAttr target) {
@@ -327,6 +356,22 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
IREE::GPU::MmaInterfaceAttr mmaKind =
target.getWgp().getMma()[schedule->index];

if (auto mma = dyn_cast<MMAAttr>(mmaKind)) {
bool preferColumnMajor =
coarseIsResultColumnMajor(result, mDims.back(), nDims.back());

// Note that "column major" is overloaded here. |preferColumnMajor| is in
// reference to the computation itself, while |colMajor| on MMAAttr refers
// to whether the result of the MMA instruction should be column major. MFMA
// only vectorizes along columns, so we want to pick the *opposite* of
// whatever the computation prefers (e.g. row-major compute => do MFMA
// column major).
if (IREE::GPU::is_AMD_MFMA(mma.getIntrinsic()) && !preferColumnMajor) {
mmaKind =
MMAAttr::get(mma.getContext(), mma.getIntrinsic(), /*colMajor=*/true);
}
}
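
A tiny standalone restatement of that rule (the helper name is made up): only AMD MFMA intrinsics are flipped, and only when the computation does not already prefer a column-major result.

```cpp
#include <cassert>

// Encodes the selection rule above: row-major compute on an AMD MFMA
// intrinsic => request a column-major MFMA accumulator so stores vectorize.
static bool flipMfmaToColMajor(bool isAmdMfma, bool computePrefersColMajor) {
  return isAmdMfma && !computePrefersColMajor;
}

int main() {
  assert(flipMfmaToColMajor(true, false));   // MFMA + row-major compute: flip
  assert(!flipMfmaToColMajor(true, true));   // consumer already wants col-major
  assert(!flipMfmaToColMajor(false, false)); // not MFMA (e.g. WMMA): never flip
  return 0;
}
```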

// Attach the MMA schedule as an attribute to the entry point export function
// for later access in the pipeline.
MLIRContext *context = lhs.getContext();
@@ -374,7 +419,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op) {
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
+if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp) ||
+!linalgOp.hasPureTensorSemantics()) {
return failure();
}

@@ -396,7 +442,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,

SmallVector<int64_t> bounds = igemmLoopBounds;
FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
-getMatmulLoweringConfigAndWorkgroupSize(bounds, igemmContractionMaps,
+getMatmulLoweringConfigAndWorkgroupSize(linalgOp->getResult(0), bounds,
+igemmContractionMaps,
igemmOperands, target);
if (failed(configAndWgSize)) {
return failure();
@@ -429,7 +476,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op) {
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-if (!linalgOp || !linalg::isaContractionOpInterface(linalgOp)) {
+if (!linalgOp || !linalg::isaContractionOpInterface(linalgOp) ||
+!linalgOp.hasPureTensorSemantics()) {
return failure();
}

@@ -440,7 +488,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
LDBG("Matmul TileAndFuse Config");

FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
-getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target);
+getMatmulLoweringConfigAndWorkgroupSize(linalgOp->getResult(0), bounds,
+maps, operands, target);
if (failed(configAndWgSize)) {
return failure();
}
@@ -68,6 +68,40 @@ module attributes { transform.with_named_sequence } {

// -----

#contraction_accesses = [
affine_map<() -> ()>,
affine_map<() -> ()>,
affine_map<() -> ()>
]
func.func @lower_col_major_multi_mma_mfma_32x32x8(%lhs: vector<4xf16>, %rhs: vector<4xf16>, %acc: vector<16xf32>) -> vector<16xf32> {
%0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
indexing_maps = #contraction_accesses,
iterator_types = [],
kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
} : vector<4xf16>, vector<4xf16> into vector<16xf32>
return %0 : vector<16xf32>
}

module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.iree.lower_multi_mma
} : !transform.any_op
transform.yield
}
}

// CHECK-LABEL: func @lower_col_major_multi_mma_mfma_32x32x8
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
// CHECK: amdgpu.mfma %[[RHS]] * %[[LHS]] + %[[ACC]]
// CHECK-SAME: blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
// CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>

// -----

#contraction_accesses = [
affine_map<() -> ()>,
affine_map<() -> ()>,
@@ -107,6 +107,45 @@ module {

// -----

#contraction_accesses = [
affine_map<(i, j, k) -> (i, k)>,
affine_map<(i, j, k) -> (k, j)>,
affine_map<(i, j, k) -> (i, j)>
]
module {
func.func @col_major_matmul_32x32x8(%arg0: tensor<2x8x32x8xf16>, %arg1: tensor<8x2x32x8xf16>, %arg2: tensor<2x2x32x4x8xf32>) -> tensor<2x2x32x4x8xf32> {
%mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 {
indexing_maps = #contraction_accesses,
iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>,
rhs_permutation = array<i64: 1, 0>
} : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16> into tensor<2x2x32x4x8xf32>
return %mm : tensor<2x2x32x4x8xf32>
}
}

// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @col_major_matmul_32x32x8
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16>
// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x32x4x8xf32>)
// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32)
// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4)
// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[IDY]]] [2, 8, 1, 4]
// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[IDY]]] [8, 2, 1, 4]
// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
// CHECK-SAME: kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
// CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x1x4x4xf32>
// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
// CHECK: mapping = [#iree_gpu.lane_id<0>]

// -----

#contraction_accesses = [
affine_map<(i, j, k) -> (i, k)>,
affine_map<(i, j, k) -> (k, j)>,
@@ -22,7 +22,7 @@ func.func @nhwc_conv_mfma() {
// CHECK-SAME: use_igemm_convolution = true

// CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32, col_major = true>
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
@@ -51,7 +51,7 @@ func.func @nchw_conv_mfma() {
// CHECK-SAME: use_igemm_convolution = true

// CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32, col_major = true>
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
@@ -80,7 +80,7 @@ func.func @nhwc_conv_unaligned_mfma() {
// CHECK-SAME: use_igemm_convolution = true

// CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32, col_major = true>
// CHECK-SAME: padding = [2, 1, 32, 64, 32]
// CHECK-SAME: promote_operands = [0, 1, 2]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
@@ -110,7 +110,7 @@ func.func @nchw_conv_unaligned_mfma() {
// CHECK-SAME: use_igemm_convolution = true

// CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32, col_major = true>
// CHECK-SAME: padding = [1, 64, 2, 32, 32]
// CHECK-SAME: promote_operands = [0, 1, 2]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]