[Codegen][GPU] Change iree_gpu.shuffle_tensor to take a region for the read (iree-org#17425)

This reduces the number of fields required on the op and enables including reshaping of the intermediate allocation without needing to add fields to the op ad infinitum. This change has a second motivation: an issue with alloc reuse that naturally arises from hoisting static allocations out of loops. In short, such hoisting (and bufferization) requires a synchronization not only on the write to the allocation, but also after all reads have completed, because the same allocation is reused for each iteration of the loop. This dependency is not modeled with SSA before or after bufferization, meaning the fact that this operation represents both the write and the reads is saving us with some spooky action at a distance. This missing dependency needs more investigation in the future, but it is unclear to me at the moment how to navigate bufferization and vectorization. I suspect we will end up wanting a vectorization pattern for this operation, but I'm leaving that as a TODO for now. This also makes the intermediate type a tensor again, because previously we were just using `bufferization.to_memref` to get back to a tensor and the generated IR was unnatural. Perhaps worth another look in the future as well.
daveliddell · May 20, 2024 · e0f3c05 · e0f3c05
1 parent dc61fcc
commit e0f3c05
Show file tree

Hide file tree

Showing 9 changed files with 223 additions and 175 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -408,35 +408,30 @@ struct LowerShuffleTensor
                                 PatternRewriter &rewriter) const final {
     Location loc = shuffleOp.getLoc();
 
-    MemRefType allocType = shuffleOp.getSharedAllocType();
-    auto tensorType =
-        RankedTensorType::get(allocType.getShape(), allocType.getElementType());
-    Value tensorAlloc = rewriter.create<bufferization::ToTensorOp>(
-        loc, tensorType, shuffleOp.getSharedAlloc(), /*restrict=*/true,
-        /*writeable=*/true);
-
     // Step 1. Insert the source slice into the intermediate tensor.
-    SmallVector<OpFoldResult, 4> sourceOffsets =
-        shuffleOp.getMixedSourceOffsets();
-    SmallVector<OpFoldResult, 4> sourceSizes = shuffleOp.getMixedSourceSizes();
-    SmallVector<OpFoldResult, 4> sourceStrides =
-        shuffleOp.getMixedSourceStrides();
+    SmallVector<OpFoldResult, 4> sourceOffsets = shuffleOp.getMixedOffsets();
+    SmallVector<OpFoldResult, 4> sourceSizes = shuffleOp.getMixedSizes();
+    SmallVector<OpFoldResult, 4> sourceStrides = shuffleOp.getMixedStrides();
     Value insertedSlice = rewriter.create<tensor::InsertSliceOp>(
-        loc, shuffleOp.getSource(), tensorAlloc, sourceOffsets, sourceSizes,
-        sourceStrides);
+        loc, shuffleOp.getSource(), shuffleOp.getDest(), sourceOffsets,
+        sourceSizes, sourceStrides);
 
     // Step 2. Synchronize the workers.
     rewriter.create<gpu::BarrierOp>(loc);
 
-    // Step 3. Extract the result slice.
-    SmallVector<OpFoldResult, 4> resultOffsets =
-        shuffleOp.getMixedResultOffsets();
-    SmallVector<OpFoldResult, 4> resultSizes = shuffleOp.getMixedResultSizes();
-    SmallVector<OpFoldResult, 4> resultStrides =
-        shuffleOp.getMixedResultStrides();
-    rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
-        shuffleOp, shuffleOp.getType(), insertedSlice, resultOffsets,
-        resultSizes, resultStrides);
+    auto terminator = shuffleOp.getBody()->getTerminator();
+    Value replacement = terminator->getOperand(0);
+    rewriter.inlineBlockBefore(shuffleOp.getBody(), shuffleOp, {insertedSlice});
+    rewriter.replaceAllUsesWith(shuffleOp.getResult(), replacement);
+    rewriter.setInsertionPointAfterValue(replacement);
+
+    // Step 2. Synchronize the workers again after reading the shuffled values.
+    // TODO: This barrier is an approximation for what we expect bufferization +
+    // vectorization to produce. There is no guarantee that this barrier is
+    // adhered to, but the way that bufferization and vectorization works
+    // is unfriendly towards barrier-like constructs.
+    rewriter.create<gpu::BarrierOp>(loc);
+    rewriter.eraseOp(terminator);
     return success();
   }
 };
@@ -877,35 +872,27 @@ LogicalResult compareWorkerCountsAndTypes(scf::ForallOp producer,
   return success();
 }
 
-Value getReplacementSlice(RewriterBase &rewriter, Location loc,
-                          tensor::ParallelInsertSliceOp parallelInsert,
-                          tensor::ExtractSliceOp extractSlice,
-                          std::optional<Attribute> addressSpace) {
-  RankedTensorType destTensorType = parallelInsert.getDestType();
-  MemRefType allocType =
-      addressSpace ? MemRefType::get(destTensorType.getShape(),
-                                     destTensorType.getElementType(),
-                                     MemRefLayoutAttrInterface{}, *addressSpace)
-                   : MemRefType::get(destTensorType.getShape(),
-                                     destTensorType.getElementType());
-  Value dest = Value();
-  if (auto empty = parallelInsert.getDest().getDefiningOp<tensor::EmptyOp>()) {
-    OpBuilder::InsertionGuard g(rewriter);
-    rewriter.setInsertionPoint(empty);
-    dest = rewriter.create<memref::AllocOp>(loc, allocType,
-                                            empty.getDynamicSizes());
-  } else {
-    dest = rewriter.create<bufferization::ToMemrefOp>(loc, allocType,
-                                                      parallelInsert.getDest());
-  }
-  return rewriter.create<IREE::GPU::ShuffleTensorOp>(
+void replaceExtractSlice(RewriterBase &rewriter, Location loc,
+                         tensor::ParallelInsertSliceOp parallelInsert,
+                         tensor::ExtractSliceOp extractSlice) {
+  OpBuilder::InsertionGuard g(rewriter);
+  auto shuffleOp = rewriter.create<IREE::GPU::ShuffleTensorOp>(
       loc, extractSlice.getType(), parallelInsert.getSource(),
       parallelInsert.getOffsets(), parallelInsert.getSizes(),
       parallelInsert.getStrides(), parallelInsert.getStaticOffsets(),
-      parallelInsert.getStaticSizes(), parallelInsert.getStaticStrides(), dest,
-      extractSlice.getOffsets(), extractSlice.getSizes(),
-      extractSlice.getStrides(), extractSlice.getStaticOffsets(),
-      extractSlice.getStaticSizes(), extractSlice.getStaticStrides());
+      parallelInsert.getStaticSizes(), parallelInsert.getStaticStrides(),
+      parallelInsert.getDest());
+  Region *region = &shuffleOp.getRegion();
+  rewriter.createBlock(region, region->end(),
+                       ArrayRef<Type>{parallelInsert.getDestType()},
+                       ArrayRef<Location>{loc});
+  rewriter.setInsertionPointToStart(shuffleOp.getBody());
+  auto terminator =
+      rewriter.create<IREE::GPU::YieldOp>(loc, extractSlice.getResult());
+  rewriter.moveOpBefore(extractSlice, terminator);
+  extractSlice.getSourceMutable().assign(shuffleOp.getBody()->getArgument(0));
+  rewriter.replaceAllUsesExcept(extractSlice.getResult(), shuffleOp,
+                                terminator);
 }
 
 LogicalResult fuseForallIntoSlice(RewriterBase &rewriter,
@@ -975,12 +962,9 @@ LogicalResult fuseForallIntoSlice(RewriterBase &rewriter,
   auto parallelInsert =
       cast<tensor::ParallelInsertSliceOp>(*terminator.getYieldingOps().begin());
 
-  Value replacementSlice =
-      getReplacementSlice(rewriter, loc, parallelInsert, slice, addressSpace);
-  rewriter.replaceAllUsesWith(slice, replacementSlice);
+  replaceExtractSlice(rewriter, loc, parallelInsert, slice);
 
   rewriter.eraseOp(parallelInsert);
-  rewriter.eraseOp(slice);
   rewriter.eraseOp(terminator);
   rewriter.eraseOp(producer);
   return success();

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_fuse_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_fuse_forall.mlir
@@ -32,7 +32,7 @@ module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     %loops = transform.structured.match ops{["scf.forall"]} in %root : (!transform.any_op) -> !transform.any_op
     %producer, %consumer = transform.split_handle %loops : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.fuse_forall %producer into %consumer {address_space = #gpu.address_space<workgroup>} : (!transform.any_op, !transform.any_op) -> (!transform.any_op)
+    transform.iree.fuse_forall %producer into %consumer : (!transform.any_op, !transform.any_op) -> (!transform.any_op)
     transform.yield
   }
 }
@@ -45,7 +45,6 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[ARG0:[A-Za-z0-9]+]]: tensor<128x128xf32>
 
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
-//   CHECK-DAG:   %[[ALLOC:.+]] = memref.alloc() : memref<128x128xf32, #gpu.address_space<workgroup>>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
 //   CHECK-DAG:     %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
 //   CHECK-DAG:     %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])
@@ -55,9 +54,11 @@ module attributes { transform.with_named_sequence } {
 //       CHECK:     %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#1] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
 //       CHECK:     %[[INSLICE1:.+]] = tensor.extract_slice %[[EMPTY]][%[[INID0]], %[[IDS]]#1] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
 //       CHECK:     %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.shuffle_tensor %[[COPY]][%[[INID0]], %[[IDS]]#1] [2, 128] [1, 1]
-//  CHECK-SAME:       to %[[ALLOC]] [%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1]
-//  CHECK-SAME:       : tensor<2x128xf32> -> memref<128x128xf32, #gpu.address_space<workgroup>> -> tensor<16x16xf32>
+//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.shuffle_tensor %[[COPY]][%[[INID0]], %[[IDS]]#1] [2, 128] [1, 1] to %[[EMPTY]]
+//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
+//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+//       CHECK:       iree_gpu.yield %[[SLICE]]
+//       CHECK:     } : tensor<2x128xf32> -> tensor<128x128xf32> -> tensor<16x16xf32>
 //       CHECK:     %[[OUTSLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
 //       CHECK:     %[[MM:.+]] = linalg.matmul ins(%[[SHUFFLE]], %[[SHUFFLE]] : tensor<16x16xf32>, tensor<16x16xf32>)
 //  CHECK-SAME:       outs(%[[OUTSLICE]] : tensor<16x16xf32>) -> tensor<16x16xf32>
@@ -113,7 +114,6 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[ARG1:[A-Za-z0-9]+]]: tensor<128x128xf32>
 
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[ARG1]]) -> (tensor<128x128xf32>) {
-//       CHECK:     %[[ALLOC:.+]] = bufferization.to_memref %[[ARG1]]
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.shuffle_tensor %{{.*}} to %[[ALLOC]]
-//  CHECK-SAME:       : tensor<2x128xf32> -> memref<128x128xf32> -> tensor<16x16xf32>
+//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.shuffle_tensor %{{.*}} to %[[ARG1]]
+//       CHECK:       } : tensor<2x128xf32> -> tensor<128x128xf32> -> tensor<16x16xf32>
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_lower_shuffle.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_lower_shuffle.mlir
@@ -1,7 +1,11 @@
 // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s
 
-func.func @shuffle_tensor(%init: memref<6x6xf32>, %arg0: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
-  %0 = iree_gpu.shuffle_tensor %arg0[%x, 0] [2, 3] [1, 1] to %init[0, %x] [3, 2] [1, 1] : tensor<2x3xf32> -> memref<6x6xf32> -> tensor<3x2xf32>
+func.func @shuffle_tensor(%init: tensor<6x6xf32>, %source: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
+  %0 = iree_gpu.shuffle_tensor %source[%x, 0] [2, 3] [1, 1] to %init {
+  ^bb0(%intermediate: tensor<6x6xf32>):
+    %slice = tensor.extract_slice %intermediate[0, %x] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
+    iree_gpu.yield %slice : tensor<3x2xf32>
+  } : tensor<2x3xf32> -> tensor<6x6xf32> -> tensor<3x2xf32>
   return %0 : tensor<3x2xf32>
 }
 
@@ -16,22 +20,24 @@ module attributes { transform.with_named_sequence } {
 }
 
 // CHECK-LABEL: func @shuffle_tensor
-//  CHECK-SAME:   %[[INIT:[A-Za-z0-9]+]]: memref<6x6xf32>
+//  CHECK-SAME:   %[[INIT:[A-Za-z0-9]+]]: tensor<6x6xf32>
 //  CHECK-SAME:   %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
 //  CHECK-SAME:   %[[X:[A-Za-z0-9]+]]: index
 
-//       CHECK:   %[[TENSOR:.+]] = bufferization.to_tensor %[[INIT]]
-//  CHECK-SAME:     restrict
-//  CHECK-SAME:     writable
-//       CHECK:   %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[TENSOR]][%[[X]], 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<6x6xf32>
+//       CHECK:   %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][%[[X]], 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<6x6xf32>
 //       CHECK:   gpu.barrier
 //       CHECK:   %[[OUT:.+]] = tensor.extract_slice %[[IN]][0, %[[X]]] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
+//       CHECK:   gpu.barrier
 //       CHECK:   return %[[OUT]] : tensor<3x2xf32>
 
 // -----
 
-func.func @rank_reducing_shuffle_tensor(%init: memref<1x6x6xf32>, %arg0: tensor<2x3xf32>, %x: index, %y: index) -> tensor<3x2xf32> {
-  %0 = iree_gpu.shuffle_tensor %arg0[0, %x, %y] [1, 2, 3] [1, 1, 1] to %init[0, %y, %x] [1, 3, 2] [1, 1, 1] : tensor<2x3xf32> -> memref<1x6x6xf32> -> tensor<3x2xf32>
+func.func @rank_reducing_shuffle_tensor(%init: tensor<1x6x6xf32>, %source: tensor<2x3xf32>, %x: index, %y: index) -> tensor<3x2xf32> {
+  %0 = iree_gpu.shuffle_tensor %source[0, %x, %y] [1, 2, 3] [1, 1, 1] to %init {
+  ^bb0(%intermediate: tensor<1x6x6xf32>):
+    %slice = tensor.extract_slice %intermediate[0, %y, %x] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
+    iree_gpu.yield %slice : tensor<3x2xf32>
+  } : tensor<2x3xf32> -> tensor<1x6x6xf32> -> tensor<3x2xf32>
   return %0 : tensor<3x2xf32>
 }
 
@@ -46,14 +52,45 @@ module attributes { transform.with_named_sequence } {
 }
 
 // CHECK-LABEL: func @rank_reducing_shuffle_tensor
-//  CHECK-SAME:   %[[INIT:[A-Za-z0-9]+]]: memref<1x6x6xf32>
+//  CHECK-SAME:   %[[INIT:[A-Za-z0-9]+]]: tensor<1x6x6xf32>
 //  CHECK-SAME:   %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
 //  CHECK-SAME:   %[[X:[A-Za-z0-9]+]]: index
 //  CHECK-SAME:   %[[Y:[A-Za-z0-9]+]]: index
 
-//       CHECK:   %[[TENSOR:.+]] = bufferization.to_tensor %[[INIT]]
-//  CHECK-SAME:     restrict
-//  CHECK-SAME:     writable
-//       CHECK:   %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[TENSOR]][0, %[[X]], %[[Y]]] [1, 2, 3] [1, 1, 1] : tensor<2x3xf32> into tensor<1x6x6xf32>
+//       CHECK:   %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, %[[X]], %[[Y]]] [1, 2, 3] [1, 1, 1] : tensor<2x3xf32> into tensor<1x6x6xf32>
+//       CHECK:   gpu.barrier
+//       CHECK:   %[[OUT:.+]] = tensor.extract_slice %[[IN]][0, %[[Y]], %[[X]]] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
+//       CHECK:   gpu.barrier
+//       CHECK:   return %[[OUT]]
+
+// -----
+
+func.func @reshape_shuffle_tensor(%init: tensor<12x12xf32>, %source: tensor<2x3xf32>) -> tensor<2x1x3x2xf32> {
+  %0 = iree_gpu.shuffle_tensor %source[0, 0] [2, 3] [1, 1] to %init {
+  ^bb0(%intermediate: tensor<12x12xf32>):
+    %expand = tensor.expand_shape %intermediate [[0, 1], [2, 3]] output_shape [4, 3, 3, 4] : tensor<12x12xf32> into tensor<4x3x3x4xf32>
+    %slice = tensor.extract_slice %expand[0, 0, 0, 0] [2, 1, 3, 2] [1, 1, 1, 1] : tensor<4x3x3x4xf32> to tensor<2x1x3x2xf32>
+    iree_gpu.yield %slice : tensor<2x1x3x2xf32>
+  } : tensor<2x3xf32> -> tensor<12x12xf32> -> tensor<2x1x3x2xf32>
+  return %0 : tensor<2x1x3x2xf32>
+}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.lower_shuffle_tensor
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL: func @reshape_shuffle_tensor
+//  CHECK-SAME:   %[[INIT:[A-Za-z0-9]+]]: tensor<12x12xf32>
+//  CHECK-SAME:   %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
+
+//       CHECK:   %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<12x12xf32>
+//       CHECK:   gpu.barrier
+//       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape %[[IN]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [4, 3, 3, 4] : tensor<12x12xf32> into tensor<4x3x3x4xf32>
+//       CHECK:   tensor.extract_slice %[[EXPAND]][0, 0, 0, 0] [2, 1, 3, 2] [1, 1, 1, 1] : tensor<4x3x3x4xf32> to tensor<2x1x3x2xf32>
 //       CHECK:   gpu.barrier
-//       CHECK:   tensor.extract_slice %[[IN]][0, %[[Y]], %[[X]]] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel
@@ -31,6 +31,7 @@ iree_td_library(
         include = ["*.td"],
     ),
     deps = [
+        "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
         "@llvm-project//mlir:OpBaseTdFiles",
         "@llvm-project//mlir:SideEffectInterfacesTdFiles",
     ],
@@ -69,6 +70,7 @@ iree_compiler_cc_library(
         "//llvm-external-projects/iree-dialects:IREEVectorExtDialect",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",
+        "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt
@@ -40,6 +40,7 @@ iree_cc_library(
     IREEVectorExtDialect
     LLVMSupport
     MLIRAMDGPUDialect
+    MLIRControlFlowInterfaces
     MLIRIR
     MLIRLinalgDialect
     MLIRParser

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp
@@ -25,38 +25,46 @@ namespace mlir::iree_compiler::IREE::GPU {
 
 LogicalResult ShuffleTensorOp::verify() {
   // Get the equivalent tensor type for the alloc to verify against.
-  MemRefType allocType = getSharedAllocType();
-  Type allocElementType = allocType.getElementType();
+  RankedTensorType destType = getDestType();
+  Type allocElementType = destType.getElementType();
   RankedTensorType allocTensorType =
-      RankedTensorType::get(allocType.getShape(), allocElementType);
+      RankedTensorType::get(destType.getShape(), allocElementType);
 
   // Verify source type against inferred type. Slice insertion and extraction
   // use the same verification logic.
   RankedTensorType expectedType = tensor::ExtractSliceOp::inferResultType(
-      allocTensorType, getMixedSourceOffsets(), getMixedSourceSizes(),
-      getMixedSourceStrides());
+      allocTensorType, getMixedOffsets(), getMixedSizes(), getMixedStrides());
   SliceVerificationResult result =
       isRankReducedType(expectedType, getSourceType());
   if (result != SliceVerificationResult::Success) {
     return emitError("Invalid source slice type");
   }
 
-  // Do the same for the resulting tensor type
-  expectedType = tensor::ExtractSliceOp::inferResultType(
-      allocTensorType, getMixedResultOffsets(), getMixedResultSizes(),
-      getMixedResultStrides());
-  result = isRankReducedType(expectedType, getType());
-  if (result != SliceVerificationResult::Success) {
-    return emitError("Invalid result slice type");
-  }
-
   if (allocElementType != getSourceType().getElementType() ||
       allocElementType != getType().getElementType()) {
-    return emitError(
-        "Element type mismatch between source, allocation, and result");
+    return emitError("Element type mismatch between source and destination");
+  }
+  return success();
+}
+
+LogicalResult ShuffleTensorOp::verifyRegions() {
+  auto &region = getRegion();
+  Block &block = region.front();
+  if (block.getNumArguments() != 1) {
+    return emitError("expected the block to have a single argument");
+  }
+
+  if (block.getArgumentTypes()[0] != getDestType()) {
+    return emitError("expected block to have single argument type of")
+           << getDestType();
+  }
+
+  // Ensure that the region yields an element of the right type.
+  auto yieldOp = llvm::cast<GPU::YieldOp>(block.getTerminator());
+  if (yieldOp.getValue().getType() != getResult().getType()) {
+    return emitOpError("expected yield type to match result type");
   }
 
-  // TODO: Verification of the allocation size in the static case.
   return success();
 }
 

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h
@@ -13,6 +13,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 // clang-format off