From 4fffb0ebff3f85147e7a991cda352da324e8761b Mon Sep 17 00:00:00 2001
From: Ben Vanik <ben.vanik@gmail.com>
Date: Mon, 3 Feb 2025 17:34:06 -0800
Subject: [PATCH] Fixing execution region result placement. (#19872)

This uses transfer ops to place allocations that escape execution
regions. In the future we'll need something more sophisticated
(AffinityAnalysis for the escaped results, etc) but in simple programs
today where transfers are used to indicate resource movement it
correctly picks up the destinations.

This also fixes `flow.tensor.barrier` and `stream.async.barrier` to be
tied ops (as they are metadata-only) and a few issues with affinity
attribute assignment identified while tracking down the placement
affinities.
---
 .../iree/compiler/Dialect/Flow/IR/FlowOps.cpp |  33 ++-
 .../iree/compiler/Dialect/Flow/IR/FlowOps.td  |  36 +--
 .../Conversion/FlowToStream/Patterns.cpp      |   6 +-
 .../FlowToStream/test/tensor_ops.mlir         |  12 +-
 .../compiler/Dialect/Stream/IR/StreamOps.cpp  |  13 ++
 .../compiler/Dialect/Stream/IR/StreamOps.td   |  16 +-
 .../Stream/Transforms/ScheduleAllocation.cpp  | 206 ++++++++++++------
 .../Stream/Transforms/ScheduleExecution.cpp   |  11 +-
 .../Transforms/test/schedule_allocation.mlir  |  58 +++++
 .../Transforms/test/schedule_execution.mlir   |  39 ++--
 10 files changed, 309 insertions(+), 121 deletions(-)
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index df56f0d76e67..f4c289aec413 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -1828,9 +1828,9 @@ LogicalResult TensorSplatOp::verify() {
 
 LogicalResult TensorCloneOp::verify() {
   if (failed(verifyOpDynamicDims(getOperation(), {getOperand()},
-                                 getArgumentDims())) ||
+                                 getOperandDims())) ||
       failed(verifyOpDynamicDims(getOperation(), {getResult()},
-                                 getArgumentDims()))) {
+                                 getOperandDims()))) {
     return failure();
   }
   return success();
@@ -1840,7 +1840,30 @@ LogicalResult TensorCloneOp::verify() {
 // flow.tensor.barrier
 //===----------------------------------------------------------------------===//
 
-LogicalResult TensorBarrierOp::verify() { return success(); }
+LogicalResult TensorBarrierOp::verify() {
+  if (failed(verifyOpDynamicDims(getOperation(), {getOperand()},
+                                 getOperandDims()))) {
+    return failure();
+  }
+  return success();
+}
+
+Value TensorBarrierOp::getTiedResult(unsigned resultIndex) {
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(getOperand());
+}
+
+Value TensorBarrierOp::getTiedResultOperand(Value result) {
+  return getOperand();
+}
+
+::std::optional<unsigned>
+TensorBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) {
+  return {0}; // operand
+}
+
+SmallVector<int64_t> TensorBarrierOp::getTiedResultOperandIndices() {
+  return {0}; // operand
+}
 
 //===----------------------------------------------------------------------===//
 // flow.tensor.transfer
@@ -1848,9 +1871,9 @@ LogicalResult TensorBarrierOp::verify() { return success(); }
 
 LogicalResult TensorTransferOp::verify() {
   if (failed(verifyOpDynamicDims(getOperation(), {getOperand()},
-                                 getArgumentDims())) ||
+                                 getOperandDims())) ||
       failed(verifyOpDynamicDims(getOperation(), {getResult()},
-                                 getArgumentDims()))) {
+                                 getOperandDims()))) {
     return failure();
   }
   return success();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
index a10982b7fba0..0241a843b906 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -1469,14 +1469,14 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims
+    FLOW_ShapeDynamicDims:$operand_dims
   );
   let results = (outs
     FLOW_Tensor:$result
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     attr-dict-with-keyword
   }];
 
@@ -1493,8 +1493,8 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
@@ -1506,14 +1506,24 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   AllTypesMatch<["operand", "result"]>,
   DeclareOpInterfaceMethods<Util_HoistableOpInterface>,
   Util_ShapeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+      "getTiedResult",
+      "getTiedResultOperand",
+      "getTiedResultOperandIndex",
+      "getTiedResultOperandIndices",
+  ]>,
 ]> {
-  let summary = [{}];
+  let summary = [{indicates a value that must have a specific affinity}];
   let description = [{
+    Prevents fusion and scheduling of a value across an affinity boundary.
+    May introduce copy-on-write behavior if the operand value is used as well as
+    the result, so users should try to keep the operand to a single use by this
+    op.
   }];
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims,
+    FLOW_ShapeDynamicDims:$operand_dims,
     AnyAttr:$target
   );
   let results = (outs
@@ -1521,7 +1531,7 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     `on` $target
     attr-dict-with-keyword
   }];
@@ -1540,8 +1550,8 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
@@ -1564,7 +1574,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims,
+    FLOW_ShapeDynamicDims:$operand_dims,
     AnyAttr:$target
   );
   let results = (outs
@@ -1572,7 +1582,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     `to` $target
     attr-dict-with-keyword
   }];
@@ -1591,8 +1601,8 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
index d60e6b19c447..00288cc640d6 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
@@ -229,8 +229,8 @@ struct ConvertTensorCloneOp
     auto unknownType = rewriter.getType<IREE::Stream::ResourceType>();
     auto cloneOp = rewriter.create<IREE::Stream::TensorCloneOp>(
         op.getLoc(), unknownType, operand.resource, op.getOperand().getType(),
-        op.getArgumentDims(), operand.resourceSize, op.getResult().getType(),
-        flattenValues(adaptor.getArgumentDims()), operand.resourceSize,
+        op.getOperandDims(), operand.resourceSize, op.getResult().getType(),
+        flattenValues(adaptor.getOperandDims()), operand.resourceSize,
         executionAffinityAttr);
     rewriter.replaceOpWithMultiple(op, {{cloneOp, operand.resourceSize}});
     return success();
@@ -249,7 +249,7 @@ struct ConvertTensorBarrierOp
     auto barrierOp = rewriter.create<IREE::Stream::AsyncBarrierOp>(
         op.getLoc(), operand.resource.getType(), operand.resource,
         operand.resourceSize,
-        /*affinity=*/operand.affinity);
+        /*affinity=*/executionAffinityAttr);
     rewriter.replaceOpWithMultiple(op, {{barrierOp, operand.resourceSize}});
     return success();
   }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
index 4f61917ed439..640da7802f1a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
@@ -139,14 +139,14 @@ util.func public @tensorSplat(%value: i8, %dim0: index) -> tensor<?x128xi8> {
 util.global private @device : !hal.device
 
 // CHECK-LABEL: @tensorBarrierDispatch
-//  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
+//  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index)
 util.func public @tensorBarrierDispatch(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
-  // CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*>
+  // CHECK: %[[BARRIER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]}
   %barrier = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
-  // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor<?x128xi8>{%arg2} : index
-  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]])
+  // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x128xi8>{%[[DIM0]]} : index
+  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry(%[[BARRIER]])
   %0 = flow.dispatch @ex::@entry(%barrier) : (tensor<?x128xi8>{%dim0}) -> tensor<?x128xi8>{%dim0}
-  // CHECK: util.return %[[RESULT]], %[[SIZE]]
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]]
   util.return %0 : tensor<?x128xi8>
 }
 
@@ -170,7 +170,7 @@ util.global private @device : !hal.device
 // CHECK-LABEL: @tensorBarrier
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index)
 util.func public @tensorBarrier(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
-  // CHECK: %[[TRANSFER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*>
+  // CHECK: %[[TRANSFER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]}
   %transfer = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
   // CHECK: util.return %[[TRANSFER]], %[[INPUT_SIZE]]
   util.return %transfer : tensor<?x128xi8>
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
index 4623a7bd6c64..13988a999b2f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -2469,6 +2469,19 @@ bool AsyncBarrierOp::isMetadata() { return true; }
 
 LogicalResult AsyncBarrierOp::verify() { return success(); }
 
+Value AsyncBarrierOp::getTiedResult(unsigned resultIndex) {
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(getSource());
+}
+
+::std::optional<unsigned>
+AsyncBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) {
+  return {0}; // source
+}
+
+SmallVector<int64_t> AsyncBarrierOp::getTiedResultOperandIndices() {
+  return {0}; // source
+}
+
 //===----------------------------------------------------------------------===//
 // stream.async.transfer
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
index 8ed4bca948fa..62a44d5bac66 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -2291,15 +2291,25 @@ def Stream_AsyncCollectiveOp : Stream_Op<"async.collective", [
 }
 
 def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
+  AllTypesMatch<["source", "result"]>,
   Stream_AffinityOp,
   Stream_AsyncPhaseOp,
   DeclareOpInterfaceMethods<Stream_StreamableOp, [
     "isMetadata",
   ]>,
   Util_SizeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedResult",
+    "getTiedResultOperandIndex",
+    "getTiedResultOperandIndices",
+  ]>,
 ]> {
-  let summary = [{ }];
+  let summary = [{indicates a value that must have a specific affinity}];
   let description = [{
+    Prevents fusion and scheduling of a value across an affinity boundary.
+    May introduce copy-on-write behavior if the operand value is used as well as
+    the result, so users should try to keep the operand to a single use by this
+    op.
   }];
 
   let arguments = (ins
@@ -2318,11 +2328,9 @@ def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
   );
 
   let assemblyFormat = [{
+    (`on` `(` $affinity^ `)`)?
     $source `:` type($source)
     `` `{` $size `}`
-    (`from` `(` $affinity^ `)`)?
-    `->`
-    type($result)
     attr-dict-with-keyword
   }];
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
index 6cb8a2a1ce42..b7ce5dfece90 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
@@ -694,10 +694,14 @@ static LogicalResult applyAsyncTransferOp(IREE::Stream::AsyncTransferOp asyncOp,
   };
   auto currentAffinityAttr =
       IREE::Stream::AffinityAttr::lookupOrDefault(asyncOp);
-  bool transferIn = asyncOp.getSourceAffinityAttr() != currentAffinityAttr ||
-                    isStaging(asyncOp.getSource());
-  bool transferOut = asyncOp.getResultAffinityAttr() != currentAffinityAttr ||
-                     isStaging(asyncOp.getResult());
+  auto sourceAffinityAttr = asyncOp.getSourceAffinityAttr();
+  auto resultAffinityAttr = asyncOp.getResultAffinityAttr();
+  bool transferIn =
+      (sourceAffinityAttr && sourceAffinityAttr != currentAffinityAttr) ||
+      isStaging(asyncOp.getSource());
+  bool transferOut =
+      (resultAffinityAttr && resultAffinityAttr != currentAffinityAttr) ||
+      isStaging(asyncOp.getResult());
 
   auto sourceRange = scope.lookupResourceRange(asyncOp.getSource());
   auto targetRange = scope.lookupResourceRange(asyncOp.getResult());
@@ -1274,35 +1278,47 @@ struct ResultReservationSet {
 };
 
 struct ResultAllocation {
+  // Affinity for the allocations.
+  IREE::Stream::AffinityAttr affinityAttr;
   // Reservations bucketed by lifetime.
   SmallVector<ResultReservationSet> reservationSets;
 };
 
+// A map of allocation placement affinities to the alloc reservations requested.
+using ResultAllocationMap =
+    llvm::MapVector<IREE::Stream::AffinityAttr, SmallVector<ResultReservation>>;
+
 // Produces parameters for one or more result allocations composed of an ordered
-// set of |reservations| with matching lifetimes.
-static ResultAllocation
-reserveResultAllocation(ArrayRef<ResultReservation> reservations) {
-  // We want deterministic ordering of the allocations for each lifetime type
-  // so we build them all here and then just nuke the ones we don't end up
-  // using.
-  SmallVector<ResultReservationSet> sets(
-      IREE::Stream::getMaxEnumValForLifetime() + 1);
-  for (auto &reservation : reservations) {
-    auto &set =
-        sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
-    set.reservationLocs.push_back(reservation.loc);
-    set.reservationTypes.push_back(reservation.resultType);
-    set.reservationSizes.push_back(reservation.resultSize);
-    set.reservations.push_back(std::move(reservation));
-  }
+// set of reservations from |reservationMap| with matching lifetimes.
+// Allocations will be bucketed both by their allocation affinity (where they
+// should be placed) and their lifetime (how long they're expected to live).
+static std::vector<ResultAllocation>
+reserveResultAllocations(ResultAllocationMap &reservationMap) {
+  std::vector<ResultAllocation> result;
+  for (auto &[affinityAttr, reservations] : reservationMap) {
+    // We want deterministic ordering of the allocations for each lifetime type
+    // so we build them all here and then just nuke the ones we don't end up
+    // using.
+    SmallVector<ResultReservationSet> sets(
+        IREE::Stream::getMaxEnumValForLifetime() + 1);
+    for (auto &reservation : reservations) {
+      auto &set =
+          sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
+      set.reservationLocs.push_back(reservation.loc);
+      set.reservationTypes.push_back(reservation.resultType);
+      set.reservationSizes.push_back(reservation.resultSize);
+      set.reservations.push_back(std::move(reservation));
+    }
 
-  // Remove unused sets. This does a bunch of moves and is really bad but eh.
-  for (int i = sets.size() - 1; i >= 0; --i) {
-    if (sets[i].reservations.empty()) {
-      sets.erase(sets.begin() + i);
+    // Remove unused sets. This does a bunch of moves and is really bad but eh.
+    for (int i = sets.size() - 1; i >= 0; --i) {
+      if (sets[i].reservations.empty()) {
+        sets.erase(sets.begin() + i);
+      }
     }
+    result.push_back(ResultAllocation{affinityAttr, sets});
   }
-  return ResultAllocation{sets};
+  return result;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1333,6 +1349,49 @@ static Value findTiedYieldResult(Value seedValue) {
   return {};
 }
 
+// Walks up the use-def chain to find an affinity the given local value is
+// pinned to. May return nullptr if there's no assigned affinity and the
+// enclosing execution region affinity should be used.
+//
+// TODO(benvanik): change this to use an affinity analysis on the escaping
+// value instead. The local value may not have a transfer associated with it.
+static IREE::Stream::AffinityAttr findLocalValueAffinity(Value value) {
+  while (value) {
+    auto definingOp = value.getDefiningOp();
+    if (!definingOp) {
+      // Block argument or something we don't track locally.
+      return {};
+    } else if (auto transferOp =
+                   dyn_cast<IREE::Stream::AsyncTransferOp>(definingOp)) {
+      return transferOp.getResultAffinityAttr();
+    } else if (auto regionOp = dyn_cast<RegionBranchOpInterface>(definingOp)) {
+      // A region op with a yielded value (like stream.async.concurrent).
+      // Note that we always want to check for tied ops first as that will let
+      // us skip over the region entirely.
+      if (auto tiedOp = dyn_cast<IREE::Util::TiedOpInterface>(definingOp)) {
+        if (auto tiedValue = tiedOp.getTiedResultOperand(value)) {
+          value = tiedValue;
+          continue;
+        }
+      }
+      unsigned resultIndex = cast<OpResult>(value).getResultNumber();
+      auto &block = regionOp.getOperation()->getRegion(0).front();
+      auto terminatorOp =
+          cast<RegionBranchTerminatorOpInterface>(block.getTerminator());
+      value = terminatorOp.getSuccessorOperands(
+          RegionBranchPoint::parent())[resultIndex];
+    } else if (auto tiedOp =
+                   dyn_cast<IREE::Util::TiedOpInterface>(definingOp)) {
+      // If the producer is tied then try to get the operand.
+      value = tiedOp.getTiedResultOperand(value);
+    } else {
+      // Analysis blocked.
+      break;
+    }
+  }
+  return {};
+}
+
 // Returns a reversed list of subrange operations that lead from an initial
 // resource down a sequence to |derivedValue|. The first element in the list
 // will be the last subview of |derivedValue| and the last element will be the
@@ -1541,7 +1600,7 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
     auto resourceRange = ResourceRange(arg, operandSize);
     scope.mapResourceRange(arg, resourceRange, asmState.get());
   }
-  SmallVector<ResultReservation> resultReservations;
+  ResultAllocationMap resultReservations;
   for (auto [result, resultSize] :
        llvm::zip_equal(executeOp.getResults(), executeOp.getResultSizes())) {
     auto resultType = llvm::cast<IREE::Stream::ResourceType>(result.getType());
@@ -1623,6 +1682,13 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
       continue;
     }
 
+    // Find a pinned affinity for the value or inherit the execution region
+    // affinity.
+    auto allocationAffinity = findLocalValueAffinity(yieldValue);
+    if (!allocationAffinity) {
+      allocationAffinity = executeOp.getAffinityAttr();
+    }
+
     // Queue up the allocation for packing.
     ResultReservation resultReservation = {
         definingOp->getLoc(), result, resultType, resultSize, yieldValue,
@@ -1633,54 +1699,56 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
       resultReservation.result.printAsOperand(llvm::dbgs(), asmState);
       llvm::dbgs() << "\n";
     });
-    resultReservations.push_back(resultReservation);
+    resultReservations[allocationAffinity].push_back(resultReservation);
   }
-  auto resultAllocation = reserveResultAllocation(resultReservations);
-  for (auto &reservationSet : resultAllocation.reservationSets) {
-    // Allocate and tie an operand to the result.
-    auto timepointType = externalBuilder.getType<IREE::Stream::TimepointType>();
-    auto [allocaOp, suballocations] =
-        IREE::Stream::ResourceAllocaOp::createSuballocations(
-            timepointType, reservationSet.reservationTypes.front(),
-            reservationSet.reservationLocs, reservationSet.reservationSizes,
-            executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(),
-            externalBuilder);
-    newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());
-
-    auto asmState = getRootAsmState(executeOp->getParentOp());
-    LLVM_DEBUG({
-      llvm::dbgs() << "  + alloc for result reservation set: ";
-      allocaOp.print(llvm::dbgs(), *asmState);
-      llvm::dbgs() << ":\n";
-    });
-
-    for (auto [reservation, suballocation] :
-         llvm::zip_equal(reservationSet.reservations, suballocations)) {
-      newOperands.push_back(suballocation);
-      newOperandSizes.push_back(reservation.resultSize);
-      resultReplacements.push_back(
-          std::make_pair(reservation.result, suballocation));
-
-      // Insert entry arg for the new operand tied all the way to the yield.
-      auto arg =
-          entryBlock.addArgument(reservation.resultType, reservation.loc);
+  for (auto &resultAllocation : reserveResultAllocations(resultReservations)) {
+    for (auto &reservationSet : resultAllocation.reservationSets) {
+      // Allocate and tie an operand to the result.
+      auto timepointType =
+          externalBuilder.getType<IREE::Stream::TimepointType>();
+      auto [allocaOp, suballocations] =
+          IREE::Stream::ResourceAllocaOp::createSuballocations(
+              timepointType, reservationSet.reservationTypes.front(),
+              reservationSet.reservationLocs, reservationSet.reservationSizes,
+              executeOp.getAwaitTimepoint(), resultAllocation.affinityAttr,
+              externalBuilder);
+      newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());
 
+      auto asmState = getRootAsmState(executeOp->getParentOp());
       LLVM_DEBUG({
-        llvm::dbgs() << "    + adding entry arg for reservation ";
-        reservation.result.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "{";
-        reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "} from ";
-        reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << " as ";
-        arg.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "\n";
+        llvm::dbgs() << "  + alloc for result reservation set: ";
+        allocaOp.print(llvm::dbgs(), *asmState);
+        llvm::dbgs() << ":\n";
       });
 
-      // Map into scope, updating all aliases.
-      auto resourceRange = ResourceRange(arg, reservation.resultSize);
-      scope.mapResourceRange(reservation.yieldValue, resourceRange,
-                             asmState.get());
+      for (auto [reservation, suballocation] :
+           llvm::zip_equal(reservationSet.reservations, suballocations)) {
+        newOperands.push_back(suballocation);
+        newOperandSizes.push_back(reservation.resultSize);
+        resultReplacements.push_back(
+            std::make_pair(reservation.result, suballocation));
+
+        // Insert entry arg for the new operand tied all the way to the yield.
+        auto arg =
+            entryBlock.addArgument(reservation.resultType, reservation.loc);
+
+        LLVM_DEBUG({
+          llvm::dbgs() << "    + adding entry arg for reservation ";
+          reservation.result.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "{";
+          reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "} from ";
+          reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << " as ";
+          arg.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "\n";
+        });
+
+        // Map into scope, updating all aliases.
+        auto resourceRange = ResourceRange(arg, reservation.resultSize);
+        scope.mapResourceRange(reservation.yieldValue, resourceRange,
+                               asmState.get());
+      }
     }
   }
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
index a793ad4aaee8..3a26bb9a95bb 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
@@ -150,8 +150,15 @@ struct ExecutePartitionBuilder {
     // If the op has the same affinity as the partition region we can strip it.
     // Note that some ops may have affinities that are more specific and we
     // want to preserve those as long as possible.
-    if (auto affinityOp =
-            dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
+    if (auto transferOp = dyn_cast<IREE::Stream::AsyncTransferOp>(clonedOp)) {
+      if (transferOp.getSourceAffinityAttr() == partition->affinity) {
+        transferOp.setSourceAffinityAttr(nullptr);
+      }
+      if (transferOp.getResultAffinityAttr() == partition->affinity) {
+        transferOp.setResultAffinityAttr(nullptr);
+      }
+    } else if (auto affinityOp =
+                   dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
       if (affinityOp.getAffinityAttr() == partition->affinity) {
         affinityOp.setAffinityAttr(nullptr);
       }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
index 8c2d35fa73a0..79e28ac9ac27 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
@@ -530,6 +530,64 @@ util.func public @applyAsyncTransferOp(%operand: !stream.resource<transient>, %s
 
 // -----
 
+// CHECK-LABEL: @applyAsyncTransferMultiScopeOp
+// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
+util.func public @applyAsyncTransferMultiScopeOp(%operand: !stream.resource<transient>, %size: index) {
+  // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINT]])
+  // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]})
+  %result, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource<transient>{%size}) -> !stream.resource<transient>{%size} {
+    // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]]
+    // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+    // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device>) %[[ALLOCA_CAPTURE]][%c0 for %[[SIZE]]]
+    // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+    %0 = stream.async.transfer %capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device>) !stream.resource<transient>{%size}
+    stream.yield %0 : !stream.resource<transient>{%size}
+  } => !stream.timepoint
+  // CHECK: util.optimization_barrier %[[ALLOCA]]
+  util.optimization_barrier %result : !stream.resource<transient>
+  util.return
+}
+
+// -----
+
+// CHECK-LABEL: @applyAsyncConcurrentTransferMultiScopeOp
+// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
+util.func public @applyAsyncConcurrentTransferMultiScopeOp(%operand: !stream.resource<transient>, %size: index) {
+  // CHECK-DAG: %[[ALLOCA_A:.+]], %[[ALLOCA_A_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_a>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK-DAG: %[[ALLOCA_B:.+]], %[[ALLOCA_B_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_b>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK-DAG: %[[ALLOCA_TIMEPOINTS:.+]] = stream.timepoint.join max(%[[ALLOCA_A_TIMEPOINT]], %[[ALLOCA_B_TIMEPOINT]])
+  // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINTS]])
+  // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA_A]] as %[[ALLOCA_A_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA_B]] as %[[ALLOCA_B_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]})
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource<transient>{%size}) -> (!stream.resource<transient>{%size}, !stream.resource<transient>{%size}) {
+    // CHECK: stream.cmd.concurrent
+    %concurrent:2 = stream.async.concurrent with(%capture as %concurrent_capture: !stream.resource<transient>{%size}) -> (!stream.resource<transient>{%size}, !stream.resource<transient>{%size}) {
+      // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_A_CAPTURE]][%c0], %[[SIZE]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+      // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_a>) %[[ALLOCA_A_CAPTURE]][%c0 for %[[SIZE]]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+      %transfer_a = stream.async.transfer %concurrent_capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_a>) !stream.resource<transient>{%size}
+      // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_B_CAPTURE]][%c0], %[[SIZE]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+      // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_b>) %[[ALLOCA_B_CAPTURE]][%c0 for %[[SIZE]]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+      %transfer_b = stream.async.transfer %concurrent_capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_b>) !stream.resource<transient>{%size}
+      stream.yield %transfer_a, %transfer_b : !stream.resource<transient>{%size}, !stream.resource<transient>{%size}
+    }
+    stream.yield %concurrent#0, %concurrent#1 : !stream.resource<transient>{%size}, !stream.resource<transient>{%size}
+  } => !stream.timepoint
+  // CHECK: util.optimization_barrier %[[ALLOCA_A]]
+  util.optimization_barrier %results#0 : !stream.resource<transient>
+  // CHECK: util.optimization_barrier %[[ALLOCA_B]]
+  util.optimization_barrier %results#1 : !stream.resource<transient>
+  util.return
+}
+
+// -----
+
 // CHECK-LABEL: @applyAsyncDispatchOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
 util.func public @applyAsyncDispatchOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
index 7d3c2284aa86..cd6ccca95a20 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
@@ -34,8 +34,8 @@ util.func public @partitioning(%arg0: !stream.resource<external>, %arg1: !stream
 
 // -----
 
-// Tests partitioning multi device execution with barriers and transfers.
-// It validates that multi stream commands are created and run in parallel:
+// Tests partitioning multi-device execution with barriers and transfers.
+// It validates that work is split into per-device stream.async.execute regions that can run in parallel.
 
 // CHECK-LABEL: util.func public @deviceMultiDeviceSync
 util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource<transient> {
@@ -43,37 +43,38 @@ util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource<transient
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
   %c255_i32 = arith.constant 255 : i32
-  %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource<transient>{%c128}
-  %1 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %3 = stream.async.barrier %1 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  %4 = stream.async.transfer %1 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_1>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.splat
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource<transient>{%c128}
+  %1 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %3 = stream.async.barrier %1 : !stream.resource<transient>{%c128}
+  %4 = stream.async.transfer %1 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device0>) -> to(#hal.device.affinity<@device1>) !stream.resource<transient>{%c128}
 
-  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %5 = stream.async.barrier %2 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  %6 = stream.async.transfer %2 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device1>)
   // CHECK: stream.async.splat
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %2 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %5 = stream.async.barrier %2 : !stream.resource<transient>{%c128}
+  %6 = stream.async.transfer %2 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource<transient>{%c128}
 
-  %7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %9 = stream.async.barrier %7 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.dispatch
+  %7 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %8 = stream.async.barrier %7 : !stream.resource<transient>{%c128}
 
-  %8 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %10 = stream.async.transfer %8 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device1>)
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %9 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %10 = stream.async.transfer %9 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource<transient>{%c128}
 
-  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%9[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.dispatch
+  %11 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%8[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
 
   util.return %11 : !stream.resource<transient>
 }