From ba32d19b85f031600527ae7d2d65958854f0dcd3 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Wed, 17 Jul 2024 13:48:19 +0200 Subject: [PATCH] [CanonicalizeDoublyStridedOp] Fix for interleaved unit and linear dims (#564) Fixes an issue exposed by a 128x32x64 matmul: https://github.com/nod-ai/iree-amd-aie/issues/556. In the case of a strided pattern like: ``` offsets: [0, 0, 0, 0] sizes: [2, 1, 64, 64] strides: [4096, 64, 64, 1] ``` the unit dimension (size == 1) in the middle will block the recognition that this is a linear access pattern, resulting in the following canonicalized strided pattern: ``` offsets: [0, 0] sizes: [2, 4096] strides: [4096, 1] ``` If the unit dimension is first removed, the strided pattern can be canonicalized further: ``` offsets: [] sizes: [] strides: [] ``` meaning a complete linear access. NOTE: with this fix the above matmul shape is still not functional, but exhibits the same behaviour as `128x32x128` etc --- .../AMDAIECanonicalizeDoublyStridedOp.cpp | 10 +++--- .../test/canonicalize_doubly_strided_op.mlir | 36 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp index ea3fcf4d9..6e0697195 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECanonicalizeDoublyStridedOp.cpp @@ -121,14 +121,16 @@ void AMDAIECanonicalizeDoublyStridedOpPass::runOnOperation() { Operation *parentOp = getOperation(); IRRewriter rewriter(parentOp->getContext()); - // Fold linear dimensions within a DMA op. + // Fold DMA unit dimensions. Needs to happen before folding linear dimensions + // to avoid blocking detection of linear dimension folding opportunities due + // to a unit dimension in between. parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) { - (void)foldDmaOpLinearDims(rewriter, dmaOp); + (void)foldDmaOpUnitDims(rewriter, dmaOp); }); - // Fold DMA unit dimensions. + // Fold linear dimensions within a DMA op. parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) { - (void)foldDmaOpUnitDims(rewriter, dmaOp); + (void)foldDmaOpLinearDims(rewriter, dmaOp); }); // Make DMA accesses with single dimension implicit. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir index 7707241a6..8be21809b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir @@ -76,6 +76,18 @@ func.func @circular_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 2, 2, 4, 1, 8] [128, 64, 32, 8, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 2, 1, 4, 8, 1] [64, 32, 32, 8, 1, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.logicalobjectfifo.consume(%0) + return +} + +// ----- + // CHECK-LABEL: func.func @circular_dma_cpy_nd_non_zero_offset // CHECK: amdaie.circular_dma_cpy_nd // CHECK-SAME: [1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1] @@ -179,6 +191,18 @@ func.func @dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [2, 2, 1, 1, 4, 8] [64, 32, 32, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 1, 2, 1, 4, 8] [64, 64, 32, 32, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.logicalobjectfifo.consume(%0) + return +} + +// ----- + // CHECK-LABEL: func.func @dma_cpy_nd_non_zero_offset // CHECK: amdaie.dma_cpy_nd // CHECK-SAME: [1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1] @@ -282,6 +306,18 @@ func.func @npu_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 0] [2, 1, 64, 64] [4096, 64, 64, 1], [0, 0, 0, 0] [2, 1, 1, 64] [64, 64, 64, 1]) + return +} + +// ----- + // CHECK-LABEL: func.func @npu_dma_cpy_nd_non_zero_offset // CHECK: amdaie.npu.dma_cpy_nd // CHECK-SAME: [1, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1]