
Commit 503a4e1: multi_mma

Signed-off-by: Benoit Jacob <[email protected]>
bjacob committed Dec 17, 2024 (1 parent: ebebe8f)
Showing 11 changed files with 164 additions and 39 deletions.
16 changes: 14 additions & 2 deletions compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -46,8 +46,8 @@ argmax_types = [
 [iree_amdgpu_bitcode_library(
     name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch),
     srcs = [
-        "iree_uk_amdgpu_argmax_%s.c" % type,
         "common.h",
+        "iree_uk_amdgpu_argmax_%s.c" % type,
     ],
     out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
     gpu_arch = gpu_arch,
@@ -59,9 +59,21 @@ argmax_bc_files = [
     for gpu_arch in gpu_archs
 ]

+iree_amdgpu_bitcode_library(
+    name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942",
+    srcs = [
+        "common.h",
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c",
+    ],
+    out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    gpu_arch = "gfx942",
+)
+
 iree_c_embed_data(
     name = "iree_uk_amdgpu_bitcode",
-    srcs = argmax_bc_files,
+    srcs = argmax_bc_files + [
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    ],
     c_file_output = "iree_uk_amdgpu_bitcode.c",
     flatten = True,
     h_file_output = "iree_uk_amdgpu_bitcode.h",
13 changes: 13 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
 )

+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942
+  GPU_ARCH
+    gfx942
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c"
+  OUT
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
+)
+
 iree_c_embed_data(
   NAME
     iree_uk_amdgpu_bitcode
@@ -226,6 +238,7 @@ iree_c_embed_data(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
   C_FILE_OUTPUT
     "iree_uk_amdgpu_bitcode.c"
   H_FILE_OUTPUT
55 changes: 55 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
@@ -0,0 +1,55 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"

// Very naive kernel. TODO(bjacob):
// 1. Shared memory: can't allocate it within the microkernel (which is just a
// helper device function, not the actual amdgpu_kernel). Need to get it
// passed down here as a `T [[clang::address_space(3)]] *` parameter.
// 2. Better scheduling via either barrier intrinsics or inline assembly.
// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
[[clang::always_inline, clang::convergent]] void
iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4(
    const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
    int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) {
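  // a_buffer/b_buffer/c_buffer point at the data-tiled A/B/C tiles, *_offset
  // are offsets in elements of the respective element types, and k_size is
  // the number of outer K iterations (k_outer steps below).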
  typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
  typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;
  int tid = __builtin_amdgcn_workitem_id_x();
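  // The workgroup runs 256 lanes: 4 subgroups of 64 lanes each, matching the
  // subgroups_n = 4 in this ukernel's _subgroups1x4 layout.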

  // Load existing accumulators.
  int32x4_t acc[8][2] = {{0}};
  int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
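  // The C tile holds 8x2 accumulator vectors per lane (unroll_m x unroll_n);
  // vector (i, j) sits at a stride of 256, one int32x4_t per workitem.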
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 2; ++j) {
      acc[i][j] = c_global[256 * (2 * i + j) + tid];
    }
  }

  // Arithmetic loop.
  const int64x2_t *a_global =
      (const int64x2_t *)(a_buffer + a_offset) + (tid % 64);
  const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid;
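  // a_global is indexed by (tid % 64): all 4 subgroups read the same 64-lane
  // slice of A, while b_global is indexed by the full tid, so each subgroup
  // reads its own slice of B.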
  for (int k_outer = 0; k_outer < k_size; ++k_outer) {
    for (int i = 0; i < 8; ++i) {
      for (int j = 0; j < 2; ++j) {
        for (int k = 0; k < 2; ++k) {
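          // Each MFMA takes one int64 (8 int8 values) per lane from A and B
          // and accumulates into an int32x4_t; the trailing zeros are the
          // cbsz/abid/blgp modifiers.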
          acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8(
              a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0);
        }
      }
    }
    a_global += 512;
    b_global += 512;
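    // 512 is the per-k_outer footprint of each tile in int64x2_t units:
    // A is 8 (unroll_m) * 64 lanes, B is 2 (unroll_n) * 256 workitems.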
  }

  // Store accumulators.
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 2; ++j) {
      c_global[256 * (2 * i + j) + tid] = acc[i][j];
    }
  }
}
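
As a cross-check of those 512 strides, the per-k_outer tile footprints can be
recomputed from the layout parameters encoded in the ukernel's name (a
standalone sketch; the variable names are illustrative, not from the commit):

  #include <assert.h>

  int main(void) {
    // Layout parameters from the name: unroll8x2x2_subgroups1x4.
    int lanes = 64, subgroups_n = 4, unroll_m = 8, unroll_n = 2, unroll_k = 2;
    int workitems = lanes * subgroups_n;  // 256 lanes per workgroup
    int bytes_per_mfma_read = 8;          // 8 int8 values = one int64 operand
    int vec_bytes = 16;                   // sizeof(int64x2_t)
    // Per-k_outer tile sizes, in int64x2_t units.
    int a_tile = unroll_m * lanes * unroll_k * bytes_per_mfma_read / vec_bytes;
    int b_tile =
        unroll_n * workitems * unroll_k * bytes_per_mfma_read / vec_bytes;
    assert(a_tile == 512 && b_tile == 512);  // matches the += 512 above
    return 0;
  }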
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -17,6 +17,7 @@ iree_lit_test_suite(
     srcs = [
         "config_ukernel_argmax_gfx908.mlir",
         "config_ukernel_argmax_gfx942.mlir",
+        "config_ukernel_multi_mma_gfx942.mlir",
         "default_tuning_specs_amdgpu.mlir",
         "lowering_strategy_from_tuning_spec.mlir",
         "ukernel_pipeline_transform.mlir",
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_lit_test_suite(
   SRCS
     "config_ukernel_argmax_gfx908.mlir"
     "config_ukernel_argmax_gfx942.mlir"
+    "config_ukernel_multi_mma_gfx942.mlir"
     "default_tuning_specs_amdgpu.mlir"
     "lowering_strategy_from_tuning_spec.mlir"
     "ukernel_pipeline_transform.mlir"
29 changes: 29 additions & 0 deletions compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
@@ -0,0 +1,29 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s

func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>,
                                          %b : tensor<1x2x4x2x4x16x2x8xi8>,
                                          %c : tensor<1x1x8x4x2x4x16x4xi32>)
    -> tensor<1x1x8x4x2x4x16x4xi32> attributes {
      hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}>
    } {
  %d = iree_gpu.multi_mma %a, %b, %c {indexing_maps = [
      affine_map<(d0, d1, d2) -> (d0, d2)>,
      affine_map<(d0, d1, d2) -> (d1, d2)>,
      affine_map<(d0, d1, d2) -> (d0, d1)>
    ], iterator_types = [
      #iree_gpu.iterator_type<parallel>,
      #iree_gpu.iterator_type<parallel>,
      #iree_gpu.iterator_type<reduction>
    ], kind = #iree_gpu.data_tiled_mma_layout<
      intrinsic = MFMA_I32_16x16x32_I8,
      unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2
    >} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
  return %d : tensor<1x1x8x4x2x4x16x4xi32>
}
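
// The indexing maps above encode a plain matmul contraction over the outer
// dims, (M, K) x (N, K) -> (M, N); all of the tiling detail is carried by
// the data_tiled_mma_layout kind.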

// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8
// CHECK: iree_gpu.multi_mma
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
// CHECK-NOT: promote_operands
// CHECK-SAME: reduction = [0, 0, 0]
// CHECK-SAME: #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"
compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -33,10 +33,9 @@ namespace mlir::iree_compiler::IREE::GPU {

 constexpr int64_t kCacheLineSizeBits = 128 * 8;

-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op) {
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op);
   if (!multiMmaOp) {
     return failure();
@@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> reductionTileSizes(iterationRank, 0);
   for (int64_t kDim : contractionDims.k) {
     workgroupTileSizes[kDim] = 0;
-    reductionTileSizes[kDim] = 1;
+    reductionTileSizes[kDim] = ukernelConfig ? 0 : 1;
   }

   // Set tile sizes.
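
(When a ukernel is selected, the reduction dimension is left untiled, tile
size 0, rather than tiled to 1: the ukernel's own k_size loop walks all of K,
which is what the reduction = [0, 0, 0] check in the lit test above asserts.)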
@@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
                      b.getI64ArrayAttr(workgroupTileSizes));
   attrs.emplace_back(b.getStringAttr("reduction"),
                      b.getI64ArrayAttr(reductionTileSizes));
-  // Promote operands to use shared memory for LHS and RHS.
-  GPU::setPromotedOperandList(context, attrs, {0, 1});
+  if (ukernelConfig) {
+    attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig);
+  } else {
+    // Promote operands to use shared memory for LHS and RHS.
+    // Don't do that with ukernels: their untiled reduction dimension is too
+    // large to fit in shared memory, so they just want global memory and they
+    // will take care of moving small chunks at a time into a shared memory
+    // operand that will be created together with the ukernel op.
+    GPU::setPromotedOperandList(context, attrs, {0, 1});
+  }
   auto configDict = b.getDictionaryAttr(attrs);
   auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU {

 /// Helper for setting up a data tiled multi_mma config based on the specified
 /// target.
-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op);
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig);

 /// Helper for setting up a convolution config using IGEMM based on the
 /// specified target.
33 changes: 8 additions & 25 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -2043,15 +2043,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
 /// Set the configuration for argmax when ukernels are enabled.
 /// Distribute all parallel dims across different workgroups, and only use a
 /// single subgroup per workgroup.
-static LogicalResult
-setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
-                       mlir::FunctionOpInterface entryPoint,
-                       linalg::GenericOp op) {
-  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
-  if (!ukernelConfig) {
-    return failure();
-  }
-
+static LogicalResult setArgmaxUkernelConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   SmallVector<unsigned> parallelDims;
   SmallVector<unsigned> reductionDims;
   op.getParallelDims(parallelDims);
@@ -2114,15 +2108,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
   return success();
 }

-/// Make UKernels take the LLVMGPUDefault lowering pipeline.
-static LogicalResult
-setUKernelConfig(mlir::FunctionOpInterface entryPoint,
-                 IREE::Codegen::UKernelOpInterface ukernelOp) {
-  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-      entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault);
-  return setTranslationInfo(entryPoint, translationInfo);
-}
-
 /// Decides the tiling and distribution parameters for one convolution
 /// dimension. Returns true if we can successfully deduce.
 ///
@@ -2302,13 +2287,14 @@ static LogicalResult setConvolutionConfig(
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   LLVM_DEBUG({
     DBGS() << "Selecting root config for: ";
     computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
     llvm::dbgs() << "\n";
   });
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
@@ -2354,8 +2340,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
   if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) {
     LDBG("Transpose Config");
     return success();
-  } else if (genericOp && succeeded(setArgmaxUkernelConfig(
-                 target, entryPointFn, genericOp))) {
+  } else if (genericOp && ukernelConfig &&
+             succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp,
+                                              ukernelConfig))) {
     LDBG("Argmax Ukernel Config");
     return success();
   }
@@ -2379,10 +2366,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
         LDBG("Pack Config");
         return setPackConfig(target, entryPointFn, packOp);
       })
-      .Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) {
-        LDBG("Ukernel Config");
-        return setUKernelConfig(entryPointFn, ukernelOp);
-      })
       .Case<IREE::LinalgExt::CustomOp>([&](auto customOp) {
         LDBG("CustomOp Config");
         return setDefaultCustomOpLoweringConfig(entryPointFn, customOp,
compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -6,6 +6,7 @@

 #include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"

+#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
@@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     return success();
   }
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Utils/EmbeddedDataDirectory.h"
@@ -29,13 +30,35 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
                    indexType.getElementType())};
 }

+// Returns ukernel name and suffix for multi_mma. Empty name = no ukernel.
+static std::tuple<std::string, std::string>
+getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) {
+  auto mma = dyn_cast<IREE::GPU::DataTiledMMAAttr>(op.getKind());
+  if (!mma) {
+    return {}; // Only handling DataTiledMMAAttr for now.
+  }
+  std::string suffix{
+      stringifyMMAIntrinsic(mma.getIntrinsic().getValue()).lower()};
+  if (mma.getUnrollM() != 1 || mma.getUnrollN() != 1 || mma.getUnrollK() != 1) {
+    suffix += llvm::formatv("_unroll{}x{}x{}", mma.getUnrollM(),
+                            mma.getUnrollN(), mma.getUnrollK());
+  }
+  if (mma.getSubgroupsM() != 1 || mma.getSubgroupsN() != 1) {
+    suffix += llvm::formatv("_subgroups{}x{}", mma.getSubgroupsM(),
+                            mma.getSubgroupsN());
+  }
+  return {"multi_mma", suffix};
+}
+
 // Returns ukernel name and suffix for any op. Empty name = no ukernel.
 static std::tuple<std::string, std::string>
 getUKernelNameAndSuffix(Operation *op) {
   if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
     if (succeeded(isArgmaxOp(genericOp))) {
       return getUKernelNameAndSuffixForArgmax(genericOp);
     }
+  } else if (auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op)) {
+    return getUKernelNameAndSuffixForMultiMma(multiMmaOp);
   }
   return {};
 }
