diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index 8a0e65e1d..0a77f17dc 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -52,60 +52,6 @@ int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound, namespace { -/// Utility affine expression visitor to retrieve the scale and optional bias -/// from the expression. -struct RetrieveScaleAndBias - : public AffineExprVisitor { - std::optional scale; - std::optional bias; - LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) { - return failure(); - } - LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) { - return failure(); - } - LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); } - LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); } - LogicalResult visitMulExpr(AffineBinaryOpExpr expr) { - if (auto rhsSize = dyn_cast(expr.getRHS()); - isa(expr.getLHS())) { - scale = rhsSize.getValue(); - } else if (auto lhsSize = dyn_cast(expr.getLHS()); - isa(expr.getRHS())) { - scale = lhsSize.getValue(); - } - return success(); - } - LogicalResult visitAddExpr(AffineBinaryOpExpr expr) { - if (bias) return failure(); - if (auto rhsSize = dyn_cast(expr.getRHS())) { - bias = rhsSize.getValue(); - if (bias.value() < 0) return failure(); - if (isa(expr.getLHS())) { - return visit(expr.getLHS()); - } else if (isa(expr.getLHS())) { - scale = 1; - return success(); - } else { - return failure(); - } - } else if (auto lhsSize = dyn_cast(expr.getLHS())) { - bias = lhsSize.getValue(); - if (bias.value() < 0) return failure(); - if (isa(expr.getRHS())) { - return visit(expr.getRHS()); - } else if (isa(expr.getRHS())) { - scale = 1; - return success(); - } else { - return failure(); - } - 
} else { - return failure(); - } - } -}; - struct SubsumeLoopIntoDMA : public OpInterfaceRewritePattern { using OpInterfaceRewritePattern::OpInterfaceRewritePattern; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h index e628cc739..f24ed3196 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h @@ -12,12 +12,71 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" namespace mlir::iree_compiler::AMDAIE { +/// Utility to retrieve a constant index from an OpFoldResult. +int64_t getConstantIndexOrAssert(OpFoldResult dim); + +/// Utility affine expression visitor to retrieve the scale and optional bias +/// from the expression. 
+struct RetrieveScaleAndBias
+    : public AffineExprVisitor<RetrieveScaleAndBias, LogicalResult> {
+  /// Constant factor of a visited `dim * constant` / `constant * dim` product,
+  /// or 1 when an add has a bare dim as its non-constant operand.
+  std::optional<int64_t> scale;
+  /// Constant operand of the visited add; visiting fails if it is negative.
+  std::optional<int64_t> bias;
+  LogicalResult visitAffineBinaryOpExpr(AffineBinaryOpExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitConstantExpr(AffineConstantExpr /*expr*/) {
+    return failure();
+  }
+  LogicalResult visitDimExpr(AffineDimExpr /*expr*/) { return failure(); }
+  LogicalResult visitSymbolExpr(AffineSymbolExpr /*expr*/) { return failure(); }
+  /// Records `scale` for a product of a dim and a constant. Succeeds even if
+  /// the operands have another form, in which case `scale` is left untouched.
+  LogicalResult visitMulExpr(AffineBinaryOpExpr expr) {
+    // `dyn_cast` returns a null expression on a kind mismatch, so the constant
+    // has to be checked before `getValue()` is called on it.
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS());
+        rhsSize && isa<AffineDimExpr>(expr.getLHS())) {
+      scale = rhsSize.getValue();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS());
+               lhsSize && isa<AffineDimExpr>(expr.getRHS())) {
+      scale = lhsSize.getValue();
+    }
+    return success();
+  }
+  /// Records `bias` for the sum of a constant and a dim or binary expression.
+  /// Fails for a second bias, a negative bias, or any other operand kinds.
+  LogicalResult visitAddExpr(AffineBinaryOpExpr expr) {
+    if (bias) return failure();
+    AffineExpr other;
+    if (auto rhsSize = dyn_cast<AffineConstantExpr>(expr.getRHS())) {
+      bias = rhsSize.getValue();
+      other = expr.getLHS();
+    } else if (auto lhsSize = dyn_cast<AffineConstantExpr>(expr.getLHS())) {
+      bias = lhsSize.getValue();
+      other = expr.getRHS();
+    } else {
+      return failure();
+    }
+    if (bias.value() < 0) return failure();
+    // A nested binary expression is visited in turn; a bare dim has scale 1.
+    if (isa<AffineBinaryOpExpr>(other)) return visit(other);
+    if (!isa<AffineDimExpr>(other)) return failure();
+    scale = 1;
+    return success();
+  }
+};
+
 // Constant specifying the number of inter-iteration dimension for DMA
 // operations.
//
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
index 6b2fda49e..420920a6e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -8,6 +8,7 @@

 #include

+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -21,6 +22,56 @@

 namespace mlir::iree_compiler::AMDAIE {

+/// Utility to create a new logical objectfifo based on shape defined by
+/// `newSizesOpFoldResultArr`. A new alloc with the element type and memory
+/// space of the old memref is created at the old memref's defining op, and
+/// its dealloc is moved in front of the last op of the alloc's block.
+static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo(
+    IRRewriter &rewriter,
+    AMDAIE::LogicalObjectFifoFromMemrefOp &oldLogicalObjectFifo,
+    SmallVectorImpl<OpFoldResult> &newSizesOpFoldResultArr) {
+  OpBuilder::InsertionGuard guard(rewriter);
+  SmallVector<int64_t> newSizes = llvm::map_to_vector(
+      newSizesOpFoldResultArr,
+      [](OpFoldResult sizeVal) { return getConstantIndexOrAssert(sizeVal); });
+  Value oldAllocOp = oldLogicalObjectFifo.getMemref();
+  auto oldMemRefType = cast<MemRefType>(oldAllocOp.getType());
+  MemRefType newAllocType = MemRefType::get(
+      newSizes, oldMemRefType.getElementType(), MemRefLayoutAttrInterface{},
+      oldMemRefType.getMemorySpace());
+  assert(oldAllocOp.getDefiningOp() && "expected a defining op for the value");
+  rewriter.setInsertionPoint(oldAllocOp.getDefiningOp());
+  auto newAllocOp =
+      rewriter.create<memref::AllocOp>(rewriter.getUnknownLoc(), newAllocType);
+  auto newDeallocOp =
+      rewriter.create<memref::DeallocOp>(rewriter.getUnknownLoc(), newAllocOp);
+  newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+  auto type = cast<MemRefType>(newAllocOp.getType());
+  // Create new logical objectfifo.
+  rewriter.setInsertionPoint(oldLogicalObjectFifo);
+  return rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+      rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
+      newAllocOp.getResult(), oldLogicalObjectFifo.getTiles());
+}
+
+/// Utility to help fetch those input DmaCpyNd Ops which need to be split.
+SmallVector<AMDAIE::DmaCpyNdOp> fetchDmaCpyNdOpsToSplitOrCombine(
+    Operation *op) {
+  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps;
+  // We are currently walking through CoreOps gathering 3rd Input DmaOp (if
+  // applicable) from them.
+  // TODO(avarma): We will generalize this later.
+  op->walk([&](AMDAIE::CoreOp coreOp) {
+    SmallVector<Value> inputDmas = coreOp.getInputDmas();
+    if (inputDmas.size() != 3) return WalkResult::skip();
+    auto dmaCpyNdOp = inputDmas[2].getDefiningOp<AMDAIE::DmaCpyNdOp>();
+    assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op");
+    l2ToL1DmaOps.push_back(dmaCpyNdOp);
+    return WalkResult::advance();
+  });
+  return l2ToL1DmaOps;
+}
+
 /// Utility to verify that the split dimensions for L2 are contiguous.
 static LogicalResult checkIsRangeFromZero(
     SmallVector &splitDimsSetForL2) {
@@ -124,6 +175,33 @@ static FailureOr updateL3SourceOffset(IRRewriter &rewriter,
   return newL3AsSourceOffset;
 }

+/// Given a L2->L1 DmaCpyNd op, find the unique L3->L2 DmaCpyNd op.
+/// Fails unless exactly one DmaCpyNd user of the source objectfifo targets it.
+static FailureOr<AMDAIE::DmaCpyNdOp> fetchL3ToL2DmaCpyNdOp(
+    AMDAIE::DmaCpyNdOp l2ToL1DmaOp) {
+  LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+      l2ToL1DmaOp.getSourceObjectFifo();
+  SmallVector<AMDAIE::DmaCpyNdOp> l3ToL2DmaOps;
+  for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+    // `dyn_cast` is null for users of other op types, so test `dmaOp` first.
+    if (auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+        dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+      l3ToL2DmaOps.push_back(dmaOp);
+    }
+  }
+  if (l3ToL2DmaOps.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  if (l3ToL2DmaOps.size() > 1) {
+    LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma ops for "
+                            << sourceObjectFifo << "\n");
+    return failure();
+  }
+  return l3ToL2DmaOps[0];
+}
+
 /// A struct utility to encapsulate all the data required to perform splitting
 /// of logicalobjectfifos.
 struct SplittingLogicalObjectFifoData {
@@ -186,25 +264,10 @@ static LogicalResult checkWhetherSplitIsPossible(
   }

   // Fetch the L3 -> L2 Dma Op corresponding to the L2 buffer as target.
- SmallVector l3ToL2DmaOps; - AMDAIE::DmaCpyNdOp l3ToL2DmaOp; - for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) { - if (auto dmaOp = dyn_cast(objFifoUserOp); - dmaOp.getTargetObjectFifo() == sourceObjectFifo) { - l3ToL2DmaOps.push_back(dmaOp); - } - } - if (l3ToL2DmaOps.size() == 0) { - LLVM_DEBUG(llvm::dbgs() << "no corresponding L3->L2 dma op found for " - << sourceObjectFifo << "\n"); - return failure(); - } - if (l3ToL2DmaOps.size() > 1) { - LLVM_DEBUG(llvm::dbgs() << "found more than one L3->L2 dma ops for " - << sourceObjectFifo << "\n"); - return failure(); - } - l3ToL2DmaOp = l3ToL2DmaOps[0]; + FailureOr maybeL3ToL2DmaOp = + fetchL3ToL2DmaCpyNdOp(l2ToL1DmaOps[0]); + if (failed(maybeL3ToL2DmaOp)) return failure(); + AMDAIE::DmaCpyNdOp l3ToL2DmaOp = maybeL3ToL2DmaOp.value(); if ((l3ToL2DmaOp.getTargetMixedOffsets().size() != l3ToL2DmaOp.getSourceMixedOffsets().size()) || (l3ToL2DmaOp.getTargetMixedSizes().size() != @@ -293,9 +356,6 @@ LogicalResult splitLogicalObjectFifos( l3ToL2DmaOp.getTargetMixedOffsets(); SmallVector staticL2AsTargetSizes = l3ToL2DmaOp.getTargetMixedSizes(); - SmallVector l2ShapeAsTarget = llvm::to_vector( - cast(l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType()) - .getShape()); SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); SmallVector staticL3AsSourceSizes = @@ -310,7 +370,6 @@ LogicalResult splitLogicalObjectFifos( staticL2AsTargetSizes[dim] = oneVal; staticL3AsSourceOffsets[dim] = zeroVal; staticL3AsSourceSizes[dim] = oneVal; - l2ShapeAsTarget[dim] = 1; } // Traverse each L2->L1 DmaCpyNd op and split them. @@ -321,34 +380,18 @@ LogicalResult splitLogicalObjectFifos( l2ToL1DmaOp.getSourceMixedSizes(); // Now we'll create a new L2 buffer based on the new shape inferred earlier - // via `l2ShapeAsTarget`. 
- rewriter.setInsertionPoint(sourceAllocOp); - LogicalObjectFifoFromMemrefOp targetObjectFifo = - l2ToL1DmaOp.getTargetObjectFifo(); - Value targetAllocOp = targetObjectFifo.getMemref(); - auto oldSourceMemRefType = cast(sourceAllocOp.getType()); - auto targetMemRefType = cast(targetAllocOp.getType()); - MemRefType newAllocType = MemRefType::get( - l2ShapeAsTarget, targetMemRefType.getElementType(), - MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace()); - auto newAllocOp = rewriter.create(rewriter.getUnknownLoc(), - newAllocType); - auto newDeallocOp = rewriter.create( - rewriter.getUnknownLoc(), newAllocOp); - newDeallocOp->moveBefore(&newAllocOp->getBlock()->back()); - auto type = cast(newAllocOp.getType()); - // Create new logicalobjectfifo.from_memref for the newly created L2 buffer. - rewriter.setInsertionPoint(l2ToL1DmaOp.getSourceObjectFifo()); - auto source = rewriter.create( - rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), - newAllocOp.getResult(), sourceObjectFifo.getTiles()); + // via `staticL2AsTargetSizes`. + LogicalObjectFifoFromMemrefOp oldL2ObjectFifo = + l2ToL1DmaOp.getSourceObjectFifo(); + AMDAIE::LogicalObjectFifoFromMemrefOp source = createNewLogicalObjectFifo( + rewriter, oldL2ObjectFifo, staticL2AsTargetSizes); // -------------------------------------------- // ---------- L3 -> L2 splitting -------------- // -------------------------------------------- // Update L3 source offsets for non-split dimensions. Refer doc comment of // `updateL3SourceOffset` for the computation rationale involved. 
- SmallVector staticL3AsSourceOffsets = + SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); for (auto &&[splitDim, nonSplitdim] : llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h index 919004949..f9339b2ac 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h @@ -11,9 +11,10 @@ namespace mlir::iree_compiler::AMDAIE { -/// Utility to split logicalobjectfifos given a struct -/// `SplittingLogicalObjectFifoData` which contains all the required data to -/// perform the splitting. +/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. +SmallVector fetchDmaCpyNdOpsToSplitOrCombine(Operation *op); + +/// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops. LogicalResult splitLogicalObjectFifos( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp index e6736a7c9..4839246a4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp @@ -16,24 +16,6 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. 
-static SmallVector fetchDmaCpyNdOpsToSplit( - ModuleOp moduleOp) { - SmallVector l2ToL1DmaOps; - // We are currently walking through CoreOps gathering 3rd Input DmaOp (if - // applicable) from them. - // TODO(avarma): We will generalize this later. - moduleOp.walk([&](AMDAIE::CoreOp coreOp) { - SmallVector inputDmas = coreOp.getInputDmas(); - if (inputDmas.size() != 3) return WalkResult::skip(); - auto dmaCpyNdOp = inputDmas[2].getDefiningOp(); - assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op"); - l2ToL1DmaOps.push_back(dmaCpyNdOp); - return WalkResult::advance(); - }); - return l2ToL1DmaOps; -} - class AMDAIESplitLogicalObjFifosForConnectionReusePass : public impl::AMDAIESplitLogicalObjFifosForConnectionReuseBase< AMDAIESplitLogicalObjFifosForConnectionReusePass> { @@ -53,7 +35,7 @@ void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() { IRRewriter rewriter(context); SmallVector l2ToL1DmaOps = - fetchDmaCpyNdOpsToSplit(moduleOp); + fetchDmaCpyNdOpsToSplitOrCombine(moduleOp); if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) { LLVM_DEBUG(llvm::dbgs() diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index c1aa45c0b..fa2f73482 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -595,6 +595,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaToCircularDmaPass()); passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass());