[TileAndFuse] Add thread groups for convolution ops (nod-ai#695)

This works for now because the sizes of dimensions `DimZ` and `LinearDim0` are 1 with our tiling strategy, and so `DimY` and `DimX` map to the rows and columns of the AIE array. Follow-up work: pass to collapse scf.forall's with more than 2 induction variables to just 2. So instead of `(i,j,k) in (2,3,5)` for example, could be `(i,l) in (2,15)` and then j=l/5 k=l%5.
yzhang93 · Sep 4, 2024 · 994f6e3 · 994f6e3
1 parent a3c5036
commit 994f6e3
Show file tree

Hide file tree

Showing 3 changed files with 174 additions and 21 deletions.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
@@ -6,6 +6,7 @@
 
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "llvm/ADT/StringExtras.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
@@ -16,16 +17,128 @@
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/IR/Iterators.h"
 #include "mlir/IR/PatternMatch.h"
 
 #define DEBUG_TYPE "iree-amdaie-tile-and-fuse"
 
-
 namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
+enum class GPUGroupType { Block, Thread };
+
+/// Assign GPU dialect thread/block mapping attributes to tiled dimensions.
+///
+/// `tileSizesVal` holds the requested tile sizes (a '0' means 'do not tile')
+/// and `groupType` selects between block and thread mapping attributes.
+/// The returned vector's size is the number of non-zero values in
+/// `tileSizesVal`. Failure is returned, after emitting an error on `op`, if
+/// it is not possible to assign mapping attributes to the dimensions.
+FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
+    ArrayRef<int64_t> tileSizesVal, GPUGroupType groupType,
+    TilingInterface op) {
+  MLIRContext *context = op.getContext();
+
+  // There is one induction variable in the scf.forall for each of the
+  // non-zero tile sizes. Recall that a '0' tile size corresponds to 'do
+  // not tile'.
+  uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
+                                     [](int64_t t) { return t != 0; });
+
+  uint32_t nbIndVarsAboveOne =
+      std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
+                    [](int64_t t) { return t > 1; });
+
+  // The mlir::gpu::MappingId enum supports 13 dimensions, see:
+  // https://github.com/llvm/llvm-project/blob/main
+  //   /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
+  if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
+    return op->emitOpError("has too many dimensions to tile, ")
+           << "there are only " << mlir::gpu::getMaxEnumValForMappingId()
+           << " dimensions available in the mlir::gpu dialect, but "
+           << nbIndVars << " are required here..";
+  }
+
+  // Currently we expect only 2 tiled dimensions to be >1 when mapping
+  // to thread dimensions. This is to target the 2-D AIE array.
+  //
+  // TODO(newling) if there are 3+ dimensions, we probably need to collapse
+  // them into just 2. I'm leaving this as a follow-up task. Basically,
+  // instead of ```(i,j,k) in (2,3,5)``` we want ```(i,l) in (2,15)```,
+  // and then recover j=l/5 and k=l%5.
+  //
+  // Once the above is implemented, we can safely remove the following check:
+  if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) {
+    // At least 3 entries exceed 1 here, so indexing element 0 is safe.
+    auto tileSizesStr = std::to_string(tileSizesVal[0]);
+    for (unsigned i = 1; i < tileSizesVal.size(); ++i) {
+      tileSizesStr += ", " + std::to_string(tileSizesVal[i]);
+    }
+    return op->emitOpError("has requested tile sizes [")
+           << tileSizesStr
+           << "]. Currently we only support tiling thread dimensions "
+           << "with at most 2 dimensions having a tile size greater than 1, "
+           << "there are " << nbIndVarsAboveOne << " here.";
+  }
+
+  auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
+    auto id = static_cast<gpu::MappingId>(i);
+    if (groupType == GPUGroupType::Block)
+      return gpu::GPUBlockMappingAttr::get(context, id);
+    // Thread is the only other group type. Returning unconditionally here
+    // keeps every path of this lambda returning a value even under NDEBUG.
+    assert(groupType == GPUGroupType::Thread &&
+           "Unhandled group type, must be thread or block.");
+    return gpu::GPUThreadMappingAttr::get(context, id);
+  };
+
+  // Map an integer to an Attribute as follows:
+  // 0 -> DimY
+  // 1 -> DimX
+  // 2 -> DimZ
+  // 3 -> LinearDim0
+  // 4 -> LinearDim1
+  // etc.
+  //
+  // Note that 0 and 1 are effectively swapped, because for AIE we want to
+  // map the first dimension to AIE array columns (or something like that).
+  auto getAttribute = [&](uint32_t i) -> Attribute {
+    if (i == 0)
+      return getMappingAttributeForDimension(1);
+    else if (i == 1)
+      return getMappingAttributeForDimension(0);
+    else
+      return getMappingAttributeForDimension(i);
+  };
+
+  // We give priority to tiling dimensions of size > 1, so that they
+  // preferentially get DimY and DimX.
+  SmallVector<Attribute> mapping(tileSizesVal.size(), {});
+  uint32_t nAttributes = 0;
+  for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
+    if (tileSizesVal[i] > 1) {
+      mapping[i] = getAttribute(nAttributes);
+      ++nAttributes;
+    }
+  }
+  for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
+    if (!mapping[i] && tileSizesVal[i] > 0) {
+      mapping[i] = getAttribute(nAttributes);
+      ++nAttributes;
+    }
+  }
+
+  // Squeeze out the empty attributes (corresponding to '0's in tileSizesVal).
+  SmallVector<Attribute> finalMapping;
+  finalMapping.reserve(nbIndVars);
+  for (Attribute attr : mapping) {
+    if (attr) finalMapping.push_back(attr);
+  }
+  return finalMapping;
+}
+
 /// Utility function to check if any of the reduction dimension is being tiled.
 static bool isTilingReductionDimension(TilingInterface consumerOp,
                                        SmallVector<int64_t> tileSizesVal) {
@@ -157,27 +270,33 @@ void AMDAIETileAndFusePass::runOnOperation() {
 
   SmallVector<OpFoldResult> tileSizes =
       getAsIndexOpFoldResult(context, tileSizesVal);
+
   auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);
 
   // When tiling using scf.for we do not need to set any mapping.
   if (!useSCFFor) {
     options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-    // Here we assume there are always two levels of parallel (scf.forall)
-    // loops, and the first level of tiling is always using scf.forall and
-    // mapped to blocks. Currently we are not using mapping attributes for
-    // Conv2d ops, because there could be four parallel tiling dimensions.
-    // TODO (vivian): create AIE specific mapping attributes.
-    if (!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())) {
-      if (tilingLevel == 0) {
-        options.setMapping(
-            {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
-             gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)});
-      } else {
-        options.setMapping(
-            {gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY),
-             gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)});
-      }
+
+    // Currently only thread groups are used in lowering, blocks get unrolled
+    // temporally. In theory we should be able to just not add any block group
+    // dimensions to the outer scf.forall operation, but currently this results
+    // in compilation failure. What happens is
+    // 1) without any block group dimensions, the scf.forall operation can
+    //    be canonicalized away if the tile sizes are all 1 (small matmul, for
+    //    example), leaving only the inner thread scf.forall.
+    // 2) certain passes expect an outer scf.forall operation, so if it is
+    //    canonicalized away, the pass fails.
+    // So for now we're keeping the block group dimension here, but should
+    // be able to compile without any block group dimensions TODO(newling)
+    auto groupType =
+        tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread;
+
+    auto maybeMapping =
+        getGPUMappingAttributes(tileSizesVal, groupType, consumerOp);
+    if (failed(maybeMapping)) {
+      return signalPassFailure();
     }
+    options.setMapping(maybeMapping.value());
   }
 
   IRRewriter rewriter(context);
@@ -205,8 +324,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
                   // Fuse all Linalg ops (can be generalized later)
                   .Default([&](Operation *op) {
                     return op->getDialect() ==
-                           rewriter.getContext()
-                               ->getLoadedDialect<linalg::LinalgDialect>();
+                           context->getLoadedDialect<linalg::LinalgDialect>();
                   });
           return {fusableOp, false};
         });

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -644,6 +644,23 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
   passManager.addPass(memref::createFoldMemRefAliasOpsPass());
   passManager.addPass(createAMDAIEBridgeToAIRPass());
 
+  // Running canonicalization for all pipelines here results in failures.
+  // Example
+  // ```
+  // 'memref.cast' op is an unsupported operation. This pass currently only
+  // supports AllocOp and SubViewOp as inputs.
+  // ```
+  // It is currently required for the convolution pipeline though, to remove the
+  // extra (size-1) thread- and group- dimensions.
+  //
+  // TODO(newling) there are better solutions like:
+  // 1) make canonicalization work for scf.forall
+  // 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls.
+  // 3) resolve above 'unsupported operation' error.
+  if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
+    passManager.addPass(createCanonicalizerPass());
+  }
+
   // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass.
   if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline)
     passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());

diff --git a/...r/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir b/...r/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s                        | FileCheck %s --check-prefix=TILE-LEVEL-0
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s   | FileCheck %s --check-prefix=TILE-LEVEL-1
 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY
 
 func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {
@@ -32,7 +32,25 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3
 // TILE-LEVEL-0-SAME:   {
 //      TILE-LEVEL-0:       linalg.fill
 //      TILE-LEVEL-0:       linalg.conv_2d_nhwc_hwcf
-//      TILE-LEVEL-0:   }
+//      TILE-LEVEL-0:   }  {mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}
+
+//      TILE-LEVEL-1: @conv_2d_nhwc_hwcf
+//      TILE-LEVEL-1:   scf.forall
+// TILE-LEVEL-1-SAME:   {
+//      TILE-LEVEL-1:       linalg.fill
+//      TILE-LEVEL-1:       linalg.conv_2d_nhwc_hwcf
+//      TILE-LEVEL-1:   }  {mapping = [#gpu.thread<z>, #gpu.thread<linear_dim_0>, #gpu.thread<y>, #gpu.thread<x>]}
+
+// -----
+
+func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<2x12x12x64xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  // expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}}
+  %2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 4, 4, 4, 0, 0, 0], [1, 4, 4, 4, 0, 0, 0], [0, 0, 0, 0, 1, 1, 8]]>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  return %2 : tensor<2x12x12x64xf32>
+}
 
 // -----