From 53b96d5e967358134bd1e5ba62283c6d5be4cac4 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Thu, 9 Jan 2025 23:12:08 +0100 Subject: [PATCH] [KernelDispatch] Add matmul RHS outer permutation (#1016) --- .../Transforms/KernelDispatch.cpp | 60 +++++++++++++++---- .../test/lowering_strategy_air.mlir | 10 ++-- .../test/lowering_strategy_generic.mlir | 4 +- .../lowering_strategy_objectfifo_npu1.mlir | 30 +++++----- .../lowering_strategy_objectfifo_npu4.mlir | 2 +- 5 files changed, 72 insertions(+), 34 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index 8b0b5c95b..6d72a91e5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -316,6 +316,23 @@ FailureOr ParameterSetting::create( } } // namespace +/// Utility to set the packing inner permutation for A/LHS so that is packed as +/// [? ? m k] in case of matmul and [? ? ? m k] in case of batch_matmul. +static SmallVector setInnerPermA(bool isMatmulTransposeA) { + SmallVector innerPerm; + if (isMatmulTransposeA) { + innerPerm = {1, 0}; + } else { + innerPerm = {0, 1}; + } + return innerPerm; +} + +/// Utility to set the packing inner permutation for B/RHS so that is packed as +/// - [? ? k n] in case of matmul +/// - [? ? ? k n] in case of batch_matmul +/// - [? ? n k] in case of matmul_transpose_b +/// - [? ? ? n k] in case of batch_matmul_transpose_b. static SmallVector setInnerPermB(bool isMatmulTransposeB) { SmallVector innerPerm; if (isMatmulTransposeB) { @@ -326,14 +343,34 @@ static SmallVector setInnerPermB(bool isMatmulTransposeB) { return innerPerm; } -static SmallVector setInnerPermA(bool isMatmulTransposeA) { - SmallVector innerPerm; +/// Utility to set the packing outer permutation for A/LHS so that is packed as +/// [M K ? ?] in case of matmul and [Batch M K ? ?] in case of batch_matmul. +static SmallVector setOuterPermA(bool isMatmulTransposeA, + bool isBatchMatmul) { + SmallVector outerPerm; if (isMatmulTransposeA) { - innerPerm = {1, 0}; + outerPerm = isBatchMatmul ? SmallVector{0, 2, 1} + : SmallVector{1, 0}; } else { - innerPerm = {0, 1}; + outerPerm = isBatchMatmul ? SmallVector{0, 1, 2} + : SmallVector{0, 1}; } - return innerPerm; + return outerPerm; +} + +/// Utility to set the packing outer permutation for B/RHS so that is packed as +/// [N K ? ?] in case of matmul and [Batch N K ? ?] in case of batch_matmul. +static SmallVector setOuterPermB(bool isMatmulTransposeB, + bool isBatchMatmul) { + SmallVector outerPerm; + if (isMatmulTransposeB) { + outerPerm = isBatchMatmul ? SmallVector{0, 1, 2} + : SmallVector{0, 1}; + } else { + outerPerm = isBatchMatmul ? SmallVector{0, 2, 1} + : SmallVector{1, 0}; + } + return outerPerm; } //===----------------------------------------------------------------------===// @@ -362,7 +399,7 @@ static LogicalResult setRootConfigForPackPeelPipeline( packedSizesL0.insert(packedSizesL0.begin(), 0); } - // For matmul, transpose B matrix from [K N n k] to [K N k n] + // For matmul, transpose B matrix from [K N n k] to [N K k n] // For matmul_transpose_b, we don't have to transpose the B matrix, // since it is already [N K n k] SmallVector transposePackIndices = {0, 1}; @@ -372,11 +409,12 @@ static LogicalResult setRootConfigForPackPeelPipeline( SmallVector innerPermA = setInnerPermA(isMatmulTransposeA(linalgOp)); SmallVector innerPermB = setInnerPermB(isMatmulTransposeB(linalgOp)); SmallVector> innerPerm = {innerPermA, innerPermB}; - SmallVector outerPermVec = {0, 1}; - if (isa(linalgOp)) { - outerPermVec.push_back(2); - } - SmallVector> outerPerm = {outerPermVec, outerPermVec}; + bool isBatchMatmul = isa(linalgOp); + SmallVector outerPermA = + setOuterPermA(isMatmulTransposeA(linalgOp), isBatchMatmul); + SmallVector outerPermB = + setOuterPermB(isMatmulTransposeB(linalgOp), isBatchMatmul); + SmallVector> outerPerm = {outerPermA, outerPermB}; if (isObjectFifo) { // Add outer permutation for unpack. NOTE: This currently fails for some // tests in the AIR pipeline. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir index 56e05d1a0..93445cd35 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir @@ -2,7 +2,7 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{use-lower-to-aie-pipeline=air use-tile-pipeline=pack-peel})' %s | FileCheck %s --check-prefix=CHECK-PACK-PEEL // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PAD-PACK{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -29,7 +29,7 @@ builtin.module { // ----- // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PAD-PACK{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -190,7 +190,7 @@ builtin.module { // ----- // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -217,7 +217,7 @@ builtin.module { // ----- // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -245,7 +245,7 @@ module { // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config // CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir index c5b66817f..0e8e80a01 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir @@ -3,7 +3,7 @@ // Test generic version of matmul. // CHECK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK{LITERAL}: #amdaie.packing_config +// CHECK{LITERAL}: #amdaie.packing_config module { func.func @matmul_generic_128x128x256_i32() { %c0_i32 = arith.constant 0 : i32 @@ -63,7 +63,7 @@ module { // Test generic version of matmul_transpose_a. // CHECK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK{LITERAL}: #amdaie.packing_config +// CHECK{LITERAL}: #amdaie.packing_config module { func.func @matmul_transpose_a_generic_128x128x256_i32() { %c0_i32 = arith.constant 0 : i32 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir index ff53e7862..0638b7af9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir @@ -3,13 +3,13 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu1_4col})' %s | FileCheck %s --check-prefix=CHECK-4x4 // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -36,13 +36,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -69,13 +69,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -102,13 +102,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -168,13 +168,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir index 2450a69d0..3863e4e27 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir @@ -7,7 +7,7 @@ // CHECK-SAME: innerPerm = [ // CHECK-SAME: [0, 1], [1, 0], [0, 1] // CHECK-SAME: ], outerPerm = [ -// CHECK-SAME: [0, 1], [0, 1], [1, 0] +// CHECK-SAME: [0, 1], [1, 0], [1, 0] // CHECK-SAME: ]}, {packedSizes = [0, 0, 0, 8, 8, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], // CHECK-SAME: innerPerm = [ // CHECK-SAME: [0, 1], [1, 0], [0, 1]