From 4fffb0ebff3f85147e7a991cda352da324e8761b Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Mon, 3 Feb 2025 17:34:06 -0800 Subject: [PATCH] Fixing execution region result placement. (#19872) This uses transfer ops to place allocations that escape execution regions. In the future we'll need something more sophisticated (AffinityAnalysis for the escaped results, etc) but in simple programs today where transfers are used to indicate resource movement it correctly picks up the destinations. This also fixes `flow.tensor.barrier` and `stream.async.barrier` to be tied ops (as they are metadata-only) and a few issues with affinity attribute assignment identified while tracking down the placement affinities. --- .../iree/compiler/Dialect/Flow/IR/FlowOps.cpp | 33 ++- .../iree/compiler/Dialect/Flow/IR/FlowOps.td | 36 +-- .../Conversion/FlowToStream/Patterns.cpp | 6 +- .../FlowToStream/test/tensor_ops.mlir | 12 +- .../compiler/Dialect/Stream/IR/StreamOps.cpp | 13 ++ .../compiler/Dialect/Stream/IR/StreamOps.td | 16 +- .../Stream/Transforms/ScheduleAllocation.cpp | 206 ++++++++++++------ .../Stream/Transforms/ScheduleExecution.cpp | 11 +- .../Transforms/test/schedule_allocation.mlir | 58 +++++ .../Transforms/test/schedule_execution.mlir | 39 ++-- 10 files changed, 309 insertions(+), 121 deletions(-) diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp index df56f0d76e67..f4c289aec413 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp @@ -1828,9 +1828,9 @@ LogicalResult TensorSplatOp::verify() { LogicalResult TensorCloneOp::verify() { if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, - getArgumentDims())) || + getOperandDims())) || failed(verifyOpDynamicDims(getOperation(), {getResult()}, - getArgumentDims()))) { + getOperandDims()))) { return failure(); } return success(); @@ -1840,7 +1840,30 @@ LogicalResult TensorCloneOp::verify() { // flow.tensor.barrier //===----------------------------------------------------------------------===// -LogicalResult TensorBarrierOp::verify() { return success(); } +LogicalResult TensorBarrierOp::verify() { + if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, + getOperandDims()))) { + return failure(); + } + return success(); +} + +Value TensorBarrierOp::getTiedResult(unsigned resultIndex) { + return IREE::Util::TiedOpInterface::findTiedBaseValue(getOperand()); +} + +Value TensorBarrierOp::getTiedResultOperand(Value result) { + return getOperand(); +} + +::std::optional +TensorBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) { + return {0}; // operand +} + +SmallVector TensorBarrierOp::getTiedResultOperandIndices() { + return {0}; // operand +} //===----------------------------------------------------------------------===// // flow.tensor.transfer @@ -1848,9 +1871,9 @@ LogicalResult TensorBarrierOp::verify() { return success(); } LogicalResult TensorTransferOp::verify() { if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, - getArgumentDims())) || + getOperandDims())) || failed(verifyOpDynamicDims(getOperation(), {getResult()}, - getArgumentDims()))) { + getOperandDims()))) { return failure(); } return success(); diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td index a10982b7fba0..0241a843b906 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td +++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td @@ -1469,14 +1469,14 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [ let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims + FLOW_ShapeDynamicDims:$operand_dims ); let results = (outs FLOW_Tensor:$result ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? attr-dict-with-keyword }]; @@ -1493,8 +1493,8 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; @@ -1506,14 +1506,24 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ AllTypesMatch<["operand", "result"]>, DeclareOpInterfaceMethods, Util_ShapeAwareOp, + DeclareOpInterfaceMethods, ]> { - let summary = [{}]; + let summary = [{indicates a value that must have a specific affinity}]; let description = [{ + Prevents fusion and scheduling of a value across an affinity boundary. + May introduce copy-on-write behavior if the operand value is used as well as + the result and users should try to keep the operand to a single use by this + op. }]; let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims, + FLOW_ShapeDynamicDims:$operand_dims, AnyAttr:$target ); let results = (outs @@ -1521,7 +1531,7 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? `on` $target attr-dict-with-keyword }]; @@ -1540,8 +1550,8 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; @@ -1564,7 +1574,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims, + FLOW_ShapeDynamicDims:$operand_dims, AnyAttr:$target ); let results = (outs @@ -1572,7 +1582,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? `to` $target attr-dict-with-keyword }]; @@ -1591,8 +1601,8 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp index d60e6b19c447..00288cc640d6 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp @@ -229,8 +229,8 @@ struct ConvertTensorCloneOp auto unknownType = rewriter.getType(); auto cloneOp = rewriter.create( op.getLoc(), unknownType, operand.resource, op.getOperand().getType(), - op.getArgumentDims(), operand.resourceSize, op.getResult().getType(), - flattenValues(adaptor.getArgumentDims()), operand.resourceSize, + op.getOperandDims(), operand.resourceSize, op.getResult().getType(), + flattenValues(adaptor.getOperandDims()), operand.resourceSize, executionAffinityAttr); rewriter.replaceOpWithMultiple(op, {{cloneOp, operand.resourceSize}}); return success(); @@ -249,7 +249,7 @@ struct ConvertTensorBarrierOp auto barrierOp = rewriter.create( op.getLoc(), operand.resource.getType(), operand.resource, operand.resourceSize, - /*affinity=*/operand.affinity); + /*affinity=*/executionAffinityAttr); rewriter.replaceOpWithMultiple(op, {{barrierOp, operand.resourceSize}}); return success(); } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir index 4f61917ed439..640da7802f1a 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir @@ -139,14 +139,14 @@ util.func public @tensorSplat(%value: i8, %dim0: index) -> tensor { util.global private @device : !hal.device // CHECK-LABEL: @tensorBarrierDispatch -// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index) +// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index) util.func public @tensorBarrierDispatch(%input: tensor, %dim0: index) -> tensor { - // CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*> + // CHECK: %[[BARRIER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} %barrier = flow.tensor.barrier %input : tensor{%dim0} on #hal.device.affinity<@device> - // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor{%arg2} : index - // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]]) + // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor{%[[DIM0]]} : index + // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry(%[[BARRIER]]) %0 = flow.dispatch @ex::@entry(%barrier) : (tensor{%dim0}) -> tensor{%dim0} - // CHECK: util.return %[[RESULT]], %[[SIZE]] + // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] util.return %0 : tensor } @@ -170,7 +170,7 @@ util.global private @device : !hal.device // CHECK-LABEL: @tensorBarrier // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index) util.func public @tensorBarrier(%input: tensor, %dim0: index) -> tensor { - // CHECK: %[[TRANSFER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*> + // CHECK: %[[TRANSFER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} %transfer = flow.tensor.barrier %input : tensor{%dim0} on #hal.device.affinity<@device> // CHECK: util.return %[[TRANSFER]], %[[INPUT_SIZE]] util.return %transfer : tensor diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp index 4623a7bd6c64..13988a999b2f 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp @@ -2469,6 +2469,19 @@ bool AsyncBarrierOp::isMetadata() { return true; } LogicalResult AsyncBarrierOp::verify() { return success(); } +Value AsyncBarrierOp::getTiedResult(unsigned resultIndex) { + return IREE::Util::TiedOpInterface::findTiedBaseValue(getSource()); +} + +::std::optional +AsyncBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) { + return {0}; // source +} + +SmallVector AsyncBarrierOp::getTiedResultOperandIndices() { + return {0}; // source +} + //===----------------------------------------------------------------------===// // stream.async.transfer //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td index 8ed4bca948fa..62a44d5bac66 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td @@ -2291,15 +2291,25 @@ def Stream_AsyncCollectiveOp : Stream_Op<"async.collective", [ } def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [ + AllTypesMatch<["source", "result"]>, Stream_AffinityOp, Stream_AsyncPhaseOp, DeclareOpInterfaceMethods, Util_SizeAwareOp, + DeclareOpInterfaceMethods, ]> { - let summary = [{ }]; + let summary = [{indicates a value that must have a specific affinity}]; let description = [{ + Prevents fusion and scheduling of a value across an affinity boundary. + May introduce copy-on-write behavior if the operand value is used as well as + the result and users should try to keep the operand to a single use by this + op. }]; let arguments = (ins @@ -2318,11 +2328,9 @@ def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [ ); let assemblyFormat = [{ + (`on` `(` $affinity^ `)`)? $source `:` type($source) `` `{` $size `}` - (`from` `(` $affinity^ `)`)? - `->` - type($result) attr-dict-with-keyword }]; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp index 6cb8a2a1ce42..b7ce5dfece90 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp @@ -694,10 +694,14 @@ static LogicalResult applyAsyncTransferOp(IREE::Stream::AsyncTransferOp asyncOp, }; auto currentAffinityAttr = IREE::Stream::AffinityAttr::lookupOrDefault(asyncOp); - bool transferIn = asyncOp.getSourceAffinityAttr() != currentAffinityAttr || - isStaging(asyncOp.getSource()); - bool transferOut = asyncOp.getResultAffinityAttr() != currentAffinityAttr || - isStaging(asyncOp.getResult()); + auto sourceAffinityAttr = asyncOp.getSourceAffinityAttr(); + auto resultAffinityAttr = asyncOp.getResultAffinityAttr(); + bool transferIn = + (sourceAffinityAttr && sourceAffinityAttr != currentAffinityAttr) || + isStaging(asyncOp.getSource()); + bool transferOut = + (resultAffinityAttr && resultAffinityAttr != currentAffinityAttr) || + isStaging(asyncOp.getResult()); auto sourceRange = scope.lookupResourceRange(asyncOp.getSource()); auto targetRange = scope.lookupResourceRange(asyncOp.getResult()); @@ -1274,35 +1278,47 @@ struct ResultReservationSet { }; struct ResultAllocation { + // Affinity for the allocations. + IREE::Stream::AffinityAttr affinityAttr; // Reservations bucketed by lifetime. SmallVector reservationSets; }; +// A map of allocation placement affinities to the alloc reservations requested. +using ResultAllocationMap = + llvm::MapVector>; + // Produces parameters for one or more result allocations composed of an ordered -// set of |reservations| with matching lifetimes. -static ResultAllocation -reserveResultAllocation(ArrayRef reservations) { - // We want deterministic ordering of the allocations for each lifetime type - // so we build them all here and then just nuke the ones we don't end up - // using. - SmallVector sets( - IREE::Stream::getMaxEnumValForLifetime() + 1); - for (auto &reservation : reservations) { - auto &set = - sets[static_cast(reservation.resultType.getLifetime())]; - set.reservationLocs.push_back(reservation.loc); - set.reservationTypes.push_back(reservation.resultType); - set.reservationSizes.push_back(reservation.resultSize); - set.reservations.push_back(std::move(reservation)); - } +// set of |reservations| with matching lifetimes. Allocations will be bucketed +// both by their allocation affinity (where they should be placed) and their +// lifetime (how long they're expected to live). +static std::vector +reserveResultAllocations(ResultAllocationMap &reservationMap) { + std::vector result; + for (auto &[affinityAttr, reservations] : reservationMap) { + // We want deterministic ordering of the allocations for each lifetime type + // so we build them all here and then just nuke the ones we don't end up + // using. + SmallVector sets( + IREE::Stream::getMaxEnumValForLifetime() + 1); + for (auto &reservation : reservations) { + auto &set = + sets[static_cast(reservation.resultType.getLifetime())]; + set.reservationLocs.push_back(reservation.loc); + set.reservationTypes.push_back(reservation.resultType); + set.reservationSizes.push_back(reservation.resultSize); + set.reservations.push_back(std::move(reservation)); + } - // Remove unused sets. This does a bunch of moves and is really bad but eh. - for (int i = sets.size() - 1; i >= 0; --i) { - if (sets[i].reservations.empty()) { - sets.erase(sets.begin() + i); + // Remove unused sets. This does a bunch of moves and is really bad but eh. + for (int i = sets.size() - 1; i >= 0; --i) { + if (sets[i].reservations.empty()) { + sets.erase(sets.begin() + i); + } } + result.push_back(ResultAllocation{affinityAttr, sets}); } - return ResultAllocation{sets}; + return result; } //===----------------------------------------------------------------------===// @@ -1333,6 +1349,49 @@ static Value findTiedYieldResult(Value seedValue) { return {}; } +// Walks up the use-def chain to find an affinity the given local value is +// pinned to. May return nullptr if there's no assigned affinity and the +// enclosing execution region affinity should be used. +// +// TODO(benvanik): change this to use an affinity analysis on the escaping +// value instead. The local value may not have a transfer associated with it. +static IREE::Stream::AffinityAttr findLocalValueAffinity(Value value) { + while (value) { + auto definingOp = value.getDefiningOp(); + if (!definingOp) { + // Block argument or something we don't track locally. + return {}; + } else if (auto transferOp = + dyn_cast(definingOp)) { + return transferOp.getResultAffinityAttr(); + } else if (auto regionOp = dyn_cast(definingOp)) { + // A region op with a yielded value (like stream.async.concurrent). + // Note that we always want to check for tied ops first as that will let + // us skip over the region entirely. + if (auto tiedOp = dyn_cast(definingOp)) { + if (auto tiedValue = tiedOp.getTiedResultOperand(value)) { + value = tiedValue; + continue; + } + } + unsigned resultIndex = cast(value).getResultNumber(); + auto &block = regionOp.getOperation()->getRegion(0).front(); + auto terminatorOp = + cast(block.getTerminator()); + value = terminatorOp.getSuccessorOperands( + RegionBranchPoint::parent())[resultIndex]; + } else if (auto tiedOp = + dyn_cast(definingOp)) { + // If the producer is tied then try to get the operand. + value = tiedOp.getTiedResultOperand(value); + } else { + // Analysis blocked. + break; + } + } + return {}; +} + // Returns a reversed list of subrange operations that lead from an initial // resource down a sequence to |derivedValue|. The first element in the list // will be the last subview of |derivedValue| and the last element will be the @@ -1541,7 +1600,7 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { auto resourceRange = ResourceRange(arg, operandSize); scope.mapResourceRange(arg, resourceRange, asmState.get()); } - SmallVector resultReservations; + ResultAllocationMap resultReservations; for (auto [result, resultSize] : llvm::zip_equal(executeOp.getResults(), executeOp.getResultSizes())) { auto resultType = llvm::cast(result.getType()); @@ -1623,6 +1682,13 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { continue; } + // Find a pinned affinity for the value or inherit the execution region + // affinity. + auto allocationAffinity = findLocalValueAffinity(yieldValue); + if (!allocationAffinity) { + allocationAffinity = executeOp.getAffinityAttr(); + } + // Queue up the allocation for packing. ResultReservation resultReservation = { definingOp->getLoc(), result, resultType, resultSize, yieldValue, @@ -1633,54 +1699,56 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { resultReservation.result.printAsOperand(llvm::dbgs(), asmState); llvm::dbgs() << "\n"; }); - resultReservations.push_back(resultReservation); + resultReservations[allocationAffinity].push_back(resultReservation); } - auto resultAllocation = reserveResultAllocation(resultReservations); - for (auto &reservationSet : resultAllocation.reservationSets) { - // Allocate and tie an operand to the result. - auto timepointType = externalBuilder.getType(); - auto [allocaOp, suballocations] = - IREE::Stream::ResourceAllocaOp::createSuballocations( - timepointType, reservationSet.reservationTypes.front(), - reservationSet.reservationLocs, reservationSet.reservationSizes, - executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(), - externalBuilder); - newAwaitTimepoints.push_back(allocaOp.getResultTimepoint()); - - auto asmState = getRootAsmState(executeOp->getParentOp()); - LLVM_DEBUG({ - llvm::dbgs() << " + alloc for result reservation set: "; - allocaOp.print(llvm::dbgs(), *asmState); - llvm::dbgs() << ":\n"; - }); - - for (auto [reservation, suballocation] : - llvm::zip_equal(reservationSet.reservations, suballocations)) { - newOperands.push_back(suballocation); - newOperandSizes.push_back(reservation.resultSize); - resultReplacements.push_back( - std::make_pair(reservation.result, suballocation)); - - // Insert entry arg for the new operand tied all the way to the yield. - auto arg = - entryBlock.addArgument(reservation.resultType, reservation.loc); + for (auto &resultAllocation : reserveResultAllocations(resultReservations)) { + for (auto &reservationSet : resultAllocation.reservationSets) { + // Allocate and tie an operand to the result. + auto timepointType = + externalBuilder.getType(); + auto [allocaOp, suballocations] = + IREE::Stream::ResourceAllocaOp::createSuballocations( + timepointType, reservationSet.reservationTypes.front(), + reservationSet.reservationLocs, reservationSet.reservationSizes, + executeOp.getAwaitTimepoint(), resultAllocation.affinityAttr, + externalBuilder); + newAwaitTimepoints.push_back(allocaOp.getResultTimepoint()); + auto asmState = getRootAsmState(executeOp->getParentOp()); LLVM_DEBUG({ - llvm::dbgs() << " + adding entry arg for reservation "; - reservation.result.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "{"; - reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "} from "; - reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << " as "; - arg.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "\n"; + llvm::dbgs() << " + alloc for result reservation set: "; + allocaOp.print(llvm::dbgs(), *asmState); + llvm::dbgs() << ":\n"; }); - // Map into scope, updating all aliases. - auto resourceRange = ResourceRange(arg, reservation.resultSize); - scope.mapResourceRange(reservation.yieldValue, resourceRange, - asmState.get()); + for (auto [reservation, suballocation] : + llvm::zip_equal(reservationSet.reservations, suballocations)) { + newOperands.push_back(suballocation); + newOperandSizes.push_back(reservation.resultSize); + resultReplacements.push_back( + std::make_pair(reservation.result, suballocation)); + + // Insert entry arg for the new operand tied all the way to the yield. + auto arg = + entryBlock.addArgument(reservation.resultType, reservation.loc); + + LLVM_DEBUG({ + llvm::dbgs() << " + adding entry arg for reservation "; + reservation.result.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "{"; + reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "} from "; + reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << " as "; + arg.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "\n"; + }); + + // Map into scope, updating all aliases. + auto resourceRange = ResourceRange(arg, reservation.resultSize); + scope.mapResourceRange(reservation.yieldValue, resourceRange, + asmState.get()); + } } } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp index a793ad4aaee8..3a26bb9a95bb 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp @@ -150,8 +150,15 @@ struct ExecutePartitionBuilder { // If the op has the same affinity as the partition region we can strip it. // Note that some ops may have affinities that are more specific and we // want to preserve those as long as possible. - if (auto affinityOp = - dyn_cast(clonedOp)) { + if (auto transferOp = dyn_cast(clonedOp)) { + if (transferOp.getSourceAffinityAttr() == partition->affinity) { + transferOp.setSourceAffinityAttr(nullptr); + } + if (transferOp.getResultAffinityAttr() == partition->affinity) { + transferOp.setResultAffinityAttr(nullptr); + } + } else if (auto affinityOp = + dyn_cast(clonedOp)) { if (affinityOp.getAffinityAttr() == partition->affinity) { affinityOp.setAffinityAttr(nullptr); } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir index 8c2d35fa73a0..79e28ac9ac27 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir @@ -530,6 +530,64 @@ util.func public @applyAsyncTransferOp(%operand: !stream.resource, %s // ----- +// CHECK-LABEL: @applyAsyncTransferMultiScopeOp +// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) +util.func public @applyAsyncTransferMultiScopeOp(%operand: !stream.resource, %size: index) { + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device>) : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINT]]) + // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + %result, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource{%size}) -> !stream.resource{%size} { + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device>) %[[ALLOCA_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %0 = stream.async.transfer %capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device>) !stream.resource{%size} + stream.yield %0 : !stream.resource{%size} + } => !stream.timepoint + // CHECK: util.optimization_barrier %[[ALLOCA]] + util.optimization_barrier %result : !stream.resource + util.return +} + +// ----- + +// CHECK-LABEL: @applyAsyncConcurrentTransferMultiScopeOp +// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) +util.func public @applyAsyncConcurrentTransferMultiScopeOp(%operand: !stream.resource, %size: index) { + // CHECK-DAG: %[[ALLOCA_A:.+]], %[[ALLOCA_A_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_a>) : !stream.resource{%[[SIZE]]} + // CHECK-DAG: %[[ALLOCA_B:.+]], %[[ALLOCA_B_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_b>) : !stream.resource{%[[SIZE]]} + // CHECK-DAG: %[[ALLOCA_TIMEPOINTS:.+]] = stream.timepoint.join max(%[[ALLOCA_A_TIMEPOINT]], %[[ALLOCA_B_TIMEPOINT]]) + // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINTS]]) + // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA_A]] as %[[ALLOCA_A_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA_B]] as %[[ALLOCA_B_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}) + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource{%size}) -> (!stream.resource{%size}, !stream.resource{%size}) { + // CHECK: stream.cmd.concurrent + %concurrent:2 = stream.async.concurrent with(%capture as %concurrent_capture: !stream.resource{%size}) -> (!stream.resource{%size}, !stream.resource{%size}) { + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_A_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_a>) %[[ALLOCA_A_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %transfer_a = stream.async.transfer %concurrent_capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_a>) !stream.resource{%size} + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_B_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_b>) %[[ALLOCA_B_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %transfer_b = stream.async.transfer %concurrent_capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_b>) !stream.resource{%size} + stream.yield %transfer_a, %transfer_b : !stream.resource{%size}, !stream.resource{%size} + } + stream.yield %concurrent#0, %concurrent#1 : !stream.resource{%size}, !stream.resource{%size} + } => !stream.timepoint + // CHECK: util.optimization_barrier %[[ALLOCA_A]] + util.optimization_barrier %results#0 : !stream.resource + // CHECK: util.optimization_barrier %[[ALLOCA_B]] + util.optimization_barrier %results#1 : !stream.resource + util.return +} + +// ----- + // CHECK-LABEL: @applyAsyncDispatchOp // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index) util.func public @applyAsyncDispatchOp(%operand: !stream.resource, %size: index, %offset: index, %end: index, %length: index) { diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir index 7d3c2284aa86..cd6ccca95a20 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir @@ -34,8 +34,8 @@ util.func public @partitioning(%arg0: !stream.resource, %arg1: !stream // ----- -// Tests partitioning multi device execution with barriers and transfers. -// It validates that multi stream commands are created and run in parallel: +// Tests partitioning multi-device execution with barriers and transfers. +// It validates that multi-stream commands are created and run in parallel. // CHECK-LABEL: util.func public @deviceMultiDeviceSync util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource { @@ -43,37 +43,38 @@ util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource !stream.resource{%c128} - %1 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} - %3 = stream.async.barrier %1 : !stream.resource{%c128} -> !stream.resource - %4 = stream.async.transfer %1 : !stream.resource{%c128} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_1>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.splat // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource{%c128} + %1 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} + %3 = stream.async.barrier %1 : !stream.resource{%c128} + %4 = stream.async.transfer %1 : !stream.resource{%c128} from(#hal.device.affinity<@device0>) -> to(#hal.device.affinity<@device1>) !stream.resource{%c128} - %2 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} - %5 = stream.async.barrier %2 : !stream.resource{%c128} -> !stream.resource - %6 = stream.async.transfer %2 : !stream.resource{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device1>) // CHECK: stream.async.splat // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %2 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} + %5 = stream.async.barrier %2 : !stream.resource{%c128} + %6 = stream.async.transfer %2 : !stream.resource{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource{%c128} - %7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - %9 = stream.async.barrier %7 : !stream.resource{%c128} -> !stream.resource - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.dispatch + %7 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} + %8 = stream.async.barrier %7 : !stream.resource{%c128} - %8 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - %10 = stream.async.transfer %8 : !stream.resource{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device1>) // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %9 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} + %10 = stream.async.transfer %9 : !stream.resource{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource{%c128} - %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%9[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.dispatch + %11 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%8[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} util.return %11 : !stream.resource }