GPU ukernel lowering config for data-tiled multi_mma, and a simple ukernel. #19504

Merged: 2 commits, Dec 17, 2024
16 changes: 14 additions & 2 deletions compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -46,8 +46,8 @@ argmax_types = [
[iree_amdgpu_bitcode_library(
name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch),
srcs = [
"iree_uk_amdgpu_argmax_%s.c" % type,
"common.h",
"iree_uk_amdgpu_argmax_%s.c" % type,
],
out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
gpu_arch = gpu_arch,
@@ -59,9 +59,21 @@ argmax_bc_files = [
for gpu_arch in gpu_archs
]

iree_amdgpu_bitcode_library(
name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942",
srcs = [
"common.h",
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c",
],
out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
gpu_arch = "gfx942",
)

iree_c_embed_data(
name = "iree_uk_amdgpu_bitcode",
srcs = argmax_bc_files,
srcs = argmax_bc_files + [
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
],
c_file_output = "iree_uk_amdgpu_bitcode.c",
flatten = True,
h_file_output = "iree_uk_amdgpu_bitcode.h",
13 changes: 13 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library(
"iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
)

iree_amdgpu_bitcode_library(
NAME
iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942
GPU_ARCH
gfx942
SRCS
"common.h"
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c"
OUT
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
)

iree_c_embed_data(
NAME
iree_uk_amdgpu_bitcode
@@ -226,6 +238,7 @@ iree_c_embed_data(
"iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
"iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
"iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
C_FILE_OUTPUT
"iree_uk_amdgpu_bitcode.c"
H_FILE_OUTPUT
7 changes: 7 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/common.h
@@ -57,6 +57,13 @@ typedef __UINT64_TYPE__ uint64_t;
#define FLT_MIN __FLT_MIN__
#define FLT_MAX __FLT_MAX__

//===----------------------------------------------------------------------===//
// Vector typedefs
//===----------------------------------------------------------------------===//

typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;

//===----------------------------------------------------------------------===//
// Declarations for Clangd, which may be slightly older than actual clang.
// Drop these as clangd versions used in practice gain these builtins.
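The two typedefs above use clang/GCC vector extensions: __vector_size__(N) makes an N-byte vector, so int64x2_t is 2 x i64 and int32x4_t is 4 x i32, each 16 bytes. A minimal sketch of how such types behave (my example, not part of the PR; assumes clang, which the bitcode build already requires):

typedef __attribute__((__vector_size__(4 * 4))) int demo_int32x4; // 4 x i32

int demo(void) {
  demo_int32x4 a = {1, 2, 3, 4};
  demo_int32x4 b = {10, 20, 30, 40};
  demo_int32x4 c = a + b; // elementwise add: {11, 22, 33, 44}
  return c[2];            // per-lane indexing: 33
}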
53 changes: 53 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
@@ -0,0 +1,53 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"

// Very naive kernel. TODO(bjacob):
// 1. Shared memory: can't allocate it within the microkernel (which is just a
// helper device function, not the actual amdgpu_kernel). Need to get it
// passed down here as a `T [[clang::address_space(3)]] *` parameter.
// 2. Better scheduling via either barrier intrinsics or inline assembly.
// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
[[clang::always_inline]] void
iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4(
const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) {
int tid = __builtin_amdgcn_workitem_id_x();

// Load existing accumulators.
int32x4_t acc[8][2] = {{0}};
int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
acc[i][j] = c_global[256 * (2 * i + j) + tid];
}
}

// Arithmetic loop.
const int64x2_t *a_global =
(const int64x2_t *)(a_buffer + a_offset) + (tid % 64);
const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid;
for (int k_outer = 0; k_outer < k_size; ++k_outer) {
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 2; ++k) {
acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8(
a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0);
}
}
}
a_global += 512;
b_global += 512;
}

// Store accumulators.
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
c_global[256 * (2 * i + j) + tid] = acc[i][j];
}
}
}
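A few sanity checks on the geometry above (my sketch, not part of the PR; the 256-thread workgroup is an assumption implied by the c_global indexing 256 * (2 * i + j) + tid, i.e. subgroups_n = 4 with 64-lane wavefronts). As I read the intrinsic, each __builtin_amdgcn_mfma_i32_16x16x32_i8 call consumes one i64 (eight packed i8 values) per lane from each of A and B and accumulates into an int32x4_t. The arithmetic cross-checks the data-tiled tensor shapes in the lit test further below:

static_assert(256 * (8 * 2) * 4 == 1 * 1 * 8 * 4 * 2 * 4 * 16 * 4,
              "256 threads x (8x2) int32x4 accumulators == i32 count of the "
              "1x1x8x4x2x4x16x4 C tile");
static_assert(512 * 16 == 8 * 4 * 16 * 2 * 8,
              "A advances 512 int64x2_t (8192 bytes) per k_outer, one K-slice "
              "of the 1x2x8x4x16x2x8 i8 A tile");
static_assert(512 * 16 == 4 * 2 * 4 * 16 * 2 * 8,
              "B advances 512 int64x2_t (8192 bytes) per k_outer, one K-slice "
              "of the 1x2x4x2x4x16x2x8 i8 B tile");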
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -17,6 +17,7 @@ iree_lit_test_suite(
srcs = [
"config_ukernel_argmax_gfx908.mlir",
"config_ukernel_argmax_gfx942.mlir",
"config_ukernel_multi_mma_gfx942.mlir",
"default_tuning_specs_amdgpu.mlir",
"lowering_strategy_from_tuning_spec.mlir",
"ukernel_pipeline_transform.mlir",
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_lit_test_suite(
SRCS
"config_ukernel_argmax_gfx908.mlir"
"config_ukernel_argmax_gfx942.mlir"
"config_ukernel_multi_mma_gfx942.mlir"
"default_tuning_specs_amdgpu.mlir"
"lowering_strategy_from_tuning_spec.mlir"
"ukernel_pipeline_transform.mlir"
29 changes: 29 additions & 0 deletions compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
@@ -0,0 +1,29 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s

func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>,
%b : tensor<1x2x4x2x4x16x2x8xi8>,
%c : tensor<1x1x8x4x2x4x16x4xi32>)
-> tensor<1x1x8x4x2x4x16x4xi32> attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}>
} {
%d = iree_gpu.multi_mma %a, %b, %c {indexing_maps = [
affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>
], iterator_types = [
#iree_gpu.iterator_type<parallel>,
#iree_gpu.iterator_type<parallel>,
#iree_gpu.iterator_type<reduction>
], kind = #iree_gpu.data_tiled_mma_layout<
intrinsic = MFMA_I32_16x16x32_I8,
unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2
>} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
return %d : tensor<1x1x8x4x2x4x16x4xi32>
}

// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8
// CHECK: iree_gpu.multi_mma
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
// CHECK-NOT: promote_operands
// CHECK-SAME: reduction = [0, 0, 0]
// CHECK-SAME: #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"
compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -33,10 +33,9 @@ namespace mlir::iree_compiler::IREE::GPU

constexpr int64_t kCacheLineSizeBits = 128 * 8;

LogicalResult
setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op) {
LogicalResult setDataTiledMultiMmaLoweringConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op);
if (!multiMmaOp) {
return failure();
@@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
SmallVector<int64_t> reductionTileSizes(iterationRank, 0);
for (int64_t kDim : contractionDims.k) {
workgroupTileSizes[kDim] = 0;
reductionTileSizes[kDim] = 1;
reductionTileSizes[kDim] = ukernelConfig ? 0 : 1;
}

// Set tile sizes.
@@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
b.getI64ArrayAttr(workgroupTileSizes));
attrs.emplace_back(b.getStringAttr("reduction"),
b.getI64ArrayAttr(reductionTileSizes));
// Promote operands to use shared memory for LHS and RHS.
GPU::setPromotedOperandList(context, attrs, {0, 1});
if (ukernelConfig) {
attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig);
} else {
// Promote operands to use shared memory for LHS and RHS.
// Don't do that with ukernels: their untiled reduction dimension is too
// large to fit in shared memory, so they just want global memory and they
// will take care of moving small chunks at a time into a shared memory
// operand that will be created together with the ukernel op.
GPU::setPromotedOperandList(context, attrs, {0, 1});
}
auto configDict = b.getDictionaryAttr(attrs);
auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU

/// Helper for setting up a data tiled multi_mma config based on the specified
/// target.
LogicalResult
setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op);
LogicalResult setDataTiledMultiMmaLoweringConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig);

/// Helper for setting up a convolution config using IGEMM based on the
/// specified target.
33 changes: 8 additions & 25 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -2099,15 +2099,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
/// Set the configuration for argmax when ukernels are enabled.
/// Distribute all parallel dim across different workgroups, and only use single
/// subgroup per workgroup.
static LogicalResult
setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
if (!ukernelConfig) {
return failure();
}

static LogicalResult setArgmaxUkernelConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
SmallVector<unsigned> parallelDims;
SmallVector<unsigned> reductionDims;
op.getParallelDims(parallelDims);
@@ -2170,15 +2164,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
return success();
}

/// Make UKernels take the LLVMGPUDefault lowering pipeline.
static LogicalResult
setUKernelConfig(mlir::FunctionOpInterface entryPoint,
IREE::Codegen::UKernelOpInterface ukernelOp) {
auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault);
return setTranslationInfo(entryPoint, translationInfo);
}

/// Decides the tiling and distribution parameters for one convolution
/// dimension. Returns true if we can successfully deduce.
///
@@ -2358,13 +2343,14 @@ static LogicalResult setConvolutionConfig(
static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPointFn,
Operation *computeOp) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
LLVM_DEBUG({
DBGS() << "Selecting root config for: ";
computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
llvm::dbgs() << "\n";
});
if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
computeOp))) {
computeOp, ukernelConfig))) {
LDBG("Tile and fuse data tiled multi_mma config");
return success();
}
@@ -2410,8 +2396,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) {
LDBG("Transpose Config");
return success();
} else if (genericOp && succeeded(setArgmaxUkernelConfig(
target, entryPointFn, genericOp))) {
} else if (genericOp && ukernelConfig &&
succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp,
ukernelConfig))) {
LDBG("Argmax Ukernel Config");
return success();
}
@@ -2435,10 +2422,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
LDBG("Pack Config");
return setPackConfig(target, entryPointFn, packOp);
})
.Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) {
LDBG("Ukernel Config");
return setUKernelConfig(entryPointFn, ukernelOp);
})
.Case<IREE::LinalgExt::CustomOp>([&](auto customOp) {
LDBG("CustomOp Config");
return setDefaultCustomOpLoweringConfig(entryPointFn, customOp,
compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"

#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
@@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPointFn,
Operation *computeOp) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
computeOp))) {
computeOp, ukernelConfig))) {
return success();
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
@@ -18,8 +19,13 @@ namespace mlir::iree_compiler {

namespace {

struct UKernelNameAndSuffix {
std::string name;
std::string suffix;
};

// Returns ukernel name and suffix for argmax. Empty name = no ukernel.
static std::tuple<std::string, std::string>
static UKernelNameAndSuffix
getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
Value input = op.getDpsInputOperand(0)->get();
auto inputType = cast<ShapedType>(input.getType());
Expand All @@ -29,13 +35,34 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
indexType.getElementType())};
}

// Returns ukernel name and suffix for multi_mma. Empty name = no ukernel.
static UKernelNameAndSuffix
getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) {
auto mma = dyn_cast<IREE::GPU::DataTiledMMAAttr>(op.getKind());
if (!mma) {
return {}; // Only handling DataTiledMMAAttr for now.
}
std::string suffix{
stringifyMMAIntrinsic(mma.getIntrinsic().getValue()).lower()};
if (mma.getUnrollM() != 1 || mma.getUnrollN() != 1 || mma.getUnrollK() != 1) {
suffix += llvm::formatv("_unroll{}x{}x{}", mma.getUnrollM(),
mma.getUnrollN(), mma.getUnrollK());
}
if (mma.getSubgroupsM() != 1 || mma.getSubgroupsN() != 1) {
suffix += llvm::formatv("_subgroups{}x{}", mma.getSubgroupsM(),
mma.getSubgroupsN());
}
return {"multi_mma", suffix};
}

// Returns ukernel name and suffix for any op. Empty name = no ukernel.
static std::tuple<std::string, std::string>
getUKernelNameAndSuffix(Operation *op) {
static UKernelNameAndSuffix getUKernelNameAndSuffix(Operation *op) {
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
if (succeeded(isArgmaxOp(genericOp))) {
return getUKernelNameAndSuffixForArgmax(genericOp);
}
} else if (auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op)) {
return getUKernelNameAndSuffixForMultiMma(multiMmaOp);
}
return {};
}
@@ -44,7 +71,7 @@
static IREE::GPU::UKernelConfigAttr getUKernelConfig(Operation *op) {
MLIRContext *context = op->getContext();
auto [name, suffix] = getUKernelNameAndSuffix(op);
if (name.empty() || suffix.empty()) {
if (name.empty()) {
return {};
}
auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);
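To make the naming scheme concrete, here is a standalone re-derivation of the suffix logic in getUKernelNameAndSuffixForMultiMma above (a sketch; demoSuffix and its parameter names are mine, not IREE's):

#include <cstdio>
#include <string>

std::string demoSuffix(const std::string &intrinsic, int unrollM, int unrollN,
                       int unrollK, int subgroupsM, int subgroupsN) {
  std::string suffix = intrinsic; // e.g. "mfma_i32_16x16x32_i8"
  char buf[32];
  // Unroll factors are appended only when they are not all 1, mirroring the
  // code above.
  if (unrollM != 1 || unrollN != 1 || unrollK != 1) {
    std::snprintf(buf, sizeof buf, "_unroll%dx%dx%d", unrollM, unrollN,
                  unrollK);
    suffix += buf;
  }
  // Likewise for the subgroup distribution.
  if (subgroupsM != 1 || subgroupsN != 1) {
    std::snprintf(buf, sizeof buf, "_subgroups%dx%d", subgroupsM, subgroupsN);
    suffix += buf;
  }
  return suffix;
}

demoSuffix("mfma_i32_16x16x32_i8", 8, 2, 2, 1, 4) returns "mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"; combined with the ukernel name "multi_mma" and the gfx942 target, that is exactly the basename of the embedded bitcode file iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc referenced in the BUILD files and the lit test above.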