Commit d9921c4

[AllocsToSLM] Add thread-specific offsets (#407)
Signed-off-by: dchigarev <[email protected]>
1 parent 1377cc3 commit d9921c4

2 files changed (+104 -25)

lib/gc/Transforms/GPU/AllocsToSLM.cpp (+77 -11)
@@ -45,14 +45,19 @@ bool hasAssignedMemSpace(Value value) {
   return false;
 }
 
+// Converts `memref::AllocOp` within GPU regions to the GPU shared local
+// memory. Adjusts the allocation shape based on GPU block dimensions and
+// creates a `memref::SubViewOp` for thread-specific memory access.
 struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
   using OpRewritePattern<memref::AllocOp>::OpRewritePattern;
 
   ConvertAlloc(MLIRContext *ctx) : OpRewritePattern<memref::AllocOp>(ctx) {}
 
   LogicalResult matchAndRewrite(memref::AllocOp allocOp,
                                 PatternRewriter &rewriter) const override {
-    if (hasAssignedMemSpace(allocOp->getResult(0))) {
+    Value memref = allocOp->getResult(0);
+
+    if (hasAssignedMemSpace(memref)) {
       return rewriter.notifyMatchFailure(
           allocOp, "Memref already has some memory space attribute");
     }
@@ -62,22 +67,83 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
           "Only support allocs in GPU regions");
     }
 
-    Value memref = allocOp->getResult(0);
+    auto launchOp = allocOp->getParentOfType<gpu::LaunchOp>();
+
+    auto xSz = dyn_cast<arith::ConstantIndexOp>(
+        launchOp.getBlockSizeX().getDefiningOp());
+    auto ySz = dyn_cast<arith::ConstantIndexOp>(
+        launchOp.getBlockSizeY().getDefiningOp());
+    auto zSz = dyn_cast<arith::ConstantIndexOp>(
+        launchOp.getBlockSizeZ().getDefiningOp());
+
+    if (!xSz || !ySz || !zSz)
+      return rewriter.notifyMatchFailure(
+          allocOp, "Only support constant block sizes for now");
+
+    int64_t xI = xSz.value();
+    int64_t yI = ySz.value();
+    int64_t zI = zSz.value();
+
+    if (zI != 1)
+      return rewriter.notifyMatchFailure(
+          allocOp, "Only support 2D shared memory for now");
+
     MemRefType originalMemRefType = cast<MemRefType>(memref.getType());
+    auto originalShape = originalMemRefType.getShape();
+
+    // Scale the allocation size by the number of threads in the work-group
+    int64_t newX = originalShape[0] * xI;
+    int64_t newY = originalShape[1] * yI;
+
+    SmallVector<int64_t> newShape = {newX, newY};
 
     IntegerAttr sharedAddressSpace =
         IntegerAttr::get(rewriter.getIntegerType(64),
                          static_cast<int64_t>(gpu::AddressSpace::Private));
 
-    // Create a new MemRefType with the desired address space
-    MemRefType newMemRefType = MemRefType::get(
-        originalMemRefType.getShape(), originalMemRefType.getElementType(),
-        originalMemRefType.getLayout(), sharedAddressSpace);
-
-    Value newMemRef = rewriter.create<memref::AllocOp>(
-        allocOp.getLoc(), newMemRefType, allocOp.getOperands());
-
-    memref.replaceAllUsesWith(newMemRef);
+    MemRefType newRootMemRefType =
+        MemRefType::get(newShape, originalMemRefType.getElementType(),
+                        originalMemRefType.getLayout(), sharedAddressSpace);
+
+    Value newRootMemRef =
+        rewriter
+            .create<memref::AllocOp>(allocOp.getLoc(), newRootMemRefType,
+                                     allocOp.getOperands())
+            .getResult();
+
+    // Compute the offsets in SLM chunk for the current thread
+    auto origXConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
+                                                              originalShape[0]);
+    auto origYConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
+                                                              originalShape[1]);
+
+    auto threadIds = launchOp.getThreadIds();
+
+    auto offX =
+        rewriter
+            .create<arith::MulIOp>(allocOp.getLoc(), threadIds.x, origXConst)
+            .getResult();
+    auto offY =
+        rewriter
+            .create<arith::MulIOp>(allocOp.getLoc(), threadIds.y, origYConst)
+            .getResult();
+
+    auto offsets = getMixedValues({ShapedType::kDynamic, ShapedType::kDynamic},
+                                  {offX, offY}, rewriter);
+    auto sizes = getMixedValues(originalShape, {}, rewriter);
+    auto strides = getMixedValues({1, 1}, {}, rewriter);
+
+    auto newSlice =
+        rewriter
+            .create<memref::SubViewOp>(allocOp.getLoc(), newRootMemRef, offsets,
+                                       sizes, strides)
+            .getResult();
+    memref.replaceAllUsesWith(newSlice);
+
+    // Erase deallocs since we don't need them for SLM
+    for (auto user : newSlice.getUsers())
+      if (auto deallocOp = dyn_cast<memref::DeallocOp>(user))
+        deallocOp->erase();
 
     return success();
   }
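
For illustration, here is a minimal before/after sketch of the IR this pattern now produces for a per-thread 16x32 buffer inside a gpu.launch with a 2x4x1 thread block (shapes match the test below; the SSA names are illustrative, not taken from actual pass output):

  // Before: each thread allocates its own 16x32 buffer.
  %buf = memref.alloc() : memref<16x32xf16>

  // After: a single work-group-wide SLM allocation (address space 3), scaled by
  // the thread-block size, plus a thread-specific subview at (16 * %tx, 32 * %ty).
  %slm   = memref.alloc() : memref<32x128xf16, 3>
  %off_x = arith.muli %tx, %c16 : index
  %off_y = arith.muli %ty, %c32 : index
  %view  = memref.subview %slm[%off_x, %off_y] [16, 32] [1, 1]
           : memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
  // All uses of %buf are redirected to %view, and its memref.dealloc is erased.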

test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir (+27 -14)
@@ -2,25 +2,38 @@
 
 func.func @entry() {
   %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
 
   // Memory space wasn't assigned as it's allocated outside of gpu.launch block
-  // CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x16xf16>
-  %0 = memref.alloc() : memref<16x16xf16>
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c1, %sz_by = %c1, %sz_bz = %c1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c1, %sz_ty = %c1, %sz_tz = %c1) {
+  // CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x32xf16>
+  %0 = memref.alloc() : memref<16x32xf16>
+  // Capture thread-id variables
+  // CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
+  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[ARG5:.+]]) in
+  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c4, %[[ARG11:.+]] = %c1) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c4, %sz_tz = %c1) {
     // Memory space was changed as it's explicitly specifided
-    // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x16xf16, 1>
-    %1 = memref.alloc() : memref<16x16xf16, 1>
+    // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x32xf16, 1>
+    %1 = memref.alloc() : memref<16x32xf16, 1>
     // Added 'shared' memory space
-    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<16x16xf16, 3>
-    %2 = memref.alloc() : memref<16x16xf16>
+    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<32x128xf16, 3>
+    // CHECK: %[[OFF_X:.*]] = arith.muli %[[THREAD_X]], %c16 : index
+    // CHECK: %[[OFF_Y:.*]] = arith.muli %[[THREAD_Y]], %c32 : index
+    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], %[[OFF_Y]]] [16, 32] [1, 1]
+    // CHECK-SAME: memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
+    %2 = memref.alloc() : memref<16x32xf16>
 
-    // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_2]] : memref<16x16xf16, 1>, memref<16x16xf16, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x16xf16>)
-    linalg.add ins(%1, %2 :memref<16x16xf16, 1>, memref<16x16xf16>) outs(%0 : memref<16x16xf16>)
-    // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x16xf16, 1>
-    // CHECK: memref.dealloc %[[NEW_MEMREF_2]] : memref<16x16xf16, 3>
-    memref.dealloc %1 : memref<16x16xf16, 1>
-    memref.dealloc %2 : memref<16x16xf16>
+    // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
+    // CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[128, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
+    linalg.add ins(%1, %2 :memref<16x32xf16, 1>, memref<16x32xf16>) outs(%0 : memref<16x32xf16>)
+    // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x32xf16, 1>
+    // Verify that there are no deallocs for SLM
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
+    memref.dealloc %1 : memref<16x32xf16, 1>
+    memref.dealloc %2 : memref<16x32xf16>
     gpu.terminator
   }
   return
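
The shapes above follow directly from the scaling rule in the pattern: with a (2, 4, 1) thread block and a per-thread shape of 16x32, the work-group-wide SLM buffer is (16 * 2) x (32 * 4) = 32x128, and the thread with ids (%tx, %ty) works on the 16x32 slice starting at offset (16 * %tx, 32 * %ty). The CHECK-NOT lines then verify that no dealloc is emitted for the SLM buffer or its subview.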
