diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel index aff7b8965b320..840d45fc27cb4 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel +++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel @@ -46,8 +46,8 @@ argmax_types = [ [iree_amdgpu_bitcode_library( name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch), srcs = [ - "iree_uk_amdgpu_argmax_%s.c" % type, "common.h", + "iree_uk_amdgpu_argmax_%s.c" % type, ], out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch), gpu_arch = gpu_arch, @@ -59,9 +59,21 @@ argmax_bc_files = [ for gpu_arch in gpu_archs ] +iree_amdgpu_bitcode_library( + name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942", + srcs = [ + "common.h", + "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c", + ], + out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc", + gpu_arch = "gfx942", +) + iree_c_embed_data( name = "iree_uk_amdgpu_bitcode", - srcs = argmax_bc_files, + srcs = argmax_bc_files + [ + "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc", + ], c_file_output = "iree_uk_amdgpu_bitcode.c", flatten = True, h_file_output = "iree_uk_amdgpu_bitcode.h", diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt index 71d4705eed1a6..ad1a19028a5b4 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt @@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library( "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" ) +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942 + GPU_ARCH + gfx942 + SRCS + "common.h" + "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c" + OUT + 
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc" +) + iree_c_embed_data( NAME iree_uk_amdgpu_bitcode @@ -226,6 +238,7 @@ iree_c_embed_data( "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc" "iree_uk_amdgpu_argmax_f32i64.gfx942.bc" + "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc" C_FILE_OUTPUT "iree_uk_amdgpu_bitcode.c" H_FILE_OUTPUT diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c new file mode 100644 index 0000000000000..e7fd7649ad336 --- /dev/null +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c @@ -0,0 +1,55 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" + +// Very naive kernel. TODO(bjacob): +// 1. Shared memory: can't allocate it within the microkernel (which is just a +// helper device function, not the actual amdgpu_kernel). Need to get it +// passed down here as a `T [[clang::address_space(3)]] *` parameter. +// 2. Better scheduling via either barrier intrinsics or inline assembly. +// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2. 
+[[clang::always_inline, clang::convergent]] void +iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4( + const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer, + int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) { + typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t; + typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t; + int tid = __builtin_amdgcn_workitem_id_x(); + + // Load existing accumulators. + int32x4_t acc[8][2] = {{0}}; + int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset); + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 2; ++j) { + acc[i][j] = c_global[256 * (2 * i + j) + tid]; + } + } + + // Arithmetic loop. + const int64x2_t *a_global = + (const int64x2_t *)(a_buffer + a_offset) + (tid % 64); + const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid; + for (int k_outer = 0; k_outer < k_size; ++k_outer) { + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8( + a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0); + } + } + } + a_global += 512; + b_global += 512; + } + + // Store accumulators. 
+ for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 2; ++j) { + c_global[256 * (2 * i + j) + tid] = acc[i][j]; + } + } +} diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel index 2a71f590c6e30..7201e4b988e84 100644 --- a/compiler/plugins/target/ROCM/test/BUILD.bazel +++ b/compiler/plugins/target/ROCM/test/BUILD.bazel @@ -17,6 +17,7 @@ iree_lit_test_suite( srcs = [ "config_ukernel_argmax_gfx908.mlir", "config_ukernel_argmax_gfx942.mlir", + "config_ukernel_multi_mma_gfx942.mlir", "default_tuning_specs_amdgpu.mlir", "lowering_strategy_from_tuning_spec.mlir", "ukernel_pipeline_transform.mlir", diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt index bab88582a8b0e..06249daa0039e 100644 --- a/compiler/plugins/target/ROCM/test/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt @@ -16,6 +16,7 @@ iree_lit_test_suite( SRCS "config_ukernel_argmax_gfx908.mlir" "config_ukernel_argmax_gfx942.mlir" + "config_ukernel_multi_mma_gfx942.mlir" "default_tuning_specs_amdgpu.mlir" "lowering_strategy_from_tuning_spec.mlir" "ukernel_pipeline_transform.mlir" diff --git a/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir b/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir new file mode 100644 index 0000000000000..646418f806663 --- /dev/null +++ b/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir @@ -0,0 +1,29 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s + +func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>, + %b : tensor<1x2x4x2x4x16x2x8xi8>, + %c : tensor<1x1x8x4x2x4x16x4xi32>) + -> tensor<1x1x8x4x2x4x16x4xi32> attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}> +} { + %d = iree_gpu.multi_mma %a, 
%b, %c {indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ], iterator_types = [ + #iree_gpu.iterator_type, + #iree_gpu.iterator_type, + #iree_gpu.iterator_type + ], kind = #iree_gpu.data_tiled_mma_layout< + intrinsic = MFMA_I32_16x16x32_I8, + unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2 + >} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> + return %d : tensor<1x1x8x4x2x4x16x4xi32> +} + +// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8 +// CHECK: iree_gpu.multi_mma +// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc" +// CHECK-NOT: promote_operands +// CHECK-SAME: reduction = [0, 0, 0] +// CHECK-SAME: #iree_gpu.ukernel_config(op); if (!multiMmaOp) { return failure(); @@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target, SmallVector reductionTileSizes(iterationRank, 0); for (int64_t kDim : contractionDims.k) { workgroupTileSizes[kDim] = 0; - reductionTileSizes[kDim] = 1; + reductionTileSizes[kDim] = ukernelConfig ? 0 : 1; } // Set tile sizes. @@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target, b.getI64ArrayAttr(workgroupTileSizes)); attrs.emplace_back(b.getStringAttr("reduction"), b.getI64ArrayAttr(reductionTileSizes)); - // Promote operands to use shared memory for LHS and RHS. - GPU::setPromotedOperandList(context, attrs, {0, 1}); + if (ukernelConfig) { + attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig); + } else { + // Promote operands to use shared memory for LHS and RHS. + // Don't do that with ukernels: their untiled reduction dimension is too + // large to fit in shared memory, so they just want global memory and they + // will take care of moving small chunks at a time into a shared memory + // operand that will be created together with the ukernel op. 
+ GPU::setPromotedOperandList(context, attrs, {0, 1}); + } auto configDict = b.getDictionaryAttr(attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h index 0458ea91d6adf..636ffe5f0898e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h @@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU { /// Helper for setting up a data tiled multi_mma config based on the specified /// target. -LogicalResult -setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target, - mlir::FunctionOpInterface entryPoint, - Operation *op); +LogicalResult setDataTiledMultiMmaLoweringConfig( + IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint, + Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig); /// Helper for setting up a convolution config using IGEMM based on the /// specified target. diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index be069e7282993..fc890d1db70da 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -2043,15 +2043,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint, /// Set the configuration for argmax when ukernels are enabled. /// Distribute all parallel dim across different workgroups, and only use single /// subgroup per workgroup. 
-static LogicalResult -setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, - mlir::FunctionOpInterface entryPoint, - linalg::GenericOp op) { - IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op); - if (!ukernelConfig) { - return failure(); - } - +static LogicalResult setArgmaxUkernelConfig( + IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint, + linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) { SmallVector parallelDims; SmallVector reductionDims; op.getParallelDims(parallelDims); @@ -2114,15 +2108,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, return success(); } -/// Make UKernels take the LLVMGPUDefault lowering pipeline. -static LogicalResult -setUKernelConfig(mlir::FunctionOpInterface entryPoint, - IREE::Codegen::UKernelOpInterface ukernelOp) { - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault); - return setTranslationInfo(entryPoint, translationInfo); -} - /// Decides the tiling and distribution parameters for one convolution /// dimension. Returns true if we can succesfully deduce. 
/// @@ -2302,13 +2287,14 @@ static LogicalResult setConvolutionConfig( static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn, Operation *computeOp) { + IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp); LLVM_DEBUG({ DBGS() << "Selecting root config for: "; computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions()); llvm::dbgs() << "\n"; }); if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn, - computeOp))) { + computeOp, ukernelConfig))) { LDBG("Tile and fuse data tiled multi_mma config"); return success(); } @@ -2354,8 +2340,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) { LDBG("Transpose Config"); return success(); - } else if (genericOp && succeeded(setArgmaxUkernelConfig( - target, entryPointFn, genericOp))) { + } else if (genericOp && ukernelConfig && + succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp, + ukernelConfig))) { LDBG("Argmax Ukernel Config"); return success(); } @@ -2379,10 +2366,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, LDBG("Pack Config"); return setPackConfig(target, entryPointFn, packOp); }) - .Case([&](auto ukernelOp) { - LDBG("Ukernel Config"); - return setUKernelConfig(entryPointFn, ukernelOp); - }) .Case([&](auto customOp) { LDBG("CustomOp Config"); return setDefaultCustomOpLoweringConfig(entryPointFn, customOp, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp index 26245cba1b257..c52cd07a1cd14 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h" +#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h" #include 
"iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h" @@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target, static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn, Operation *computeOp) { + IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp); if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn, - computeOp))) { + computeOp, ukernelConfig))) { return success(); } if (auto linalgOp = dyn_cast(computeOp)) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp index 2f2861f926cc4..83946299e26b4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "iree/compiler/Utils/EmbeddedDataDirectory.h" @@ -29,6 +30,26 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) { indexType.getElementType())}; } +// Returns ukernel name and suffix for multi_mma. Empty name = no ukernel. +static std::tuple +getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) { + auto mma = dyn_cast(op.getKind()); + if (!mma) { + return {}; // Only handling DataTiledMMAAttr for now. 
+ } + std::string suffix{ + stringifyMMAIntrinsic(mma.getIntrinsic().getValue()).lower()}; + if (mma.getUnrollM() != 1 || mma.getUnrollN() != 1 || mma.getUnrollK() != 1) { + suffix += llvm::formatv("_unroll{}x{}x{}", mma.getUnrollM(), + mma.getUnrollN(), mma.getUnrollK()); + } + if (mma.getSubgroupsM() != 1 || mma.getSubgroupsN() != 1) { + suffix += llvm::formatv("_subgroups{}x{}", mma.getSubgroupsM(), + mma.getSubgroupsN()); + } + return {"multi_mma", suffix}; +} + // Returns ukernel name and suffix for any op. Empty name = no ukernel. static std::tuple getUKernelNameAndSuffix(Operation *op) { @@ -36,6 +57,8 @@ getUKernelNameAndSuffix(Operation *op) { if (succeeded(isArgmaxOp(genericOp))) { return getUKernelNameAndSuffixForArgmax(genericOp); } + } else if (auto multiMmaOp = dyn_cast(op)) { + return getUKernelNameAndSuffixForMultiMma(multiMmaOp); } return {}; }