[AllocsToSLM] Add thread-specific offsets #407

Merged · 2 commits · Nov 11, 2024
Changes from all commits
88 changes: 77 additions & 11 deletions lib/gc/Transforms/GPU/AllocsToSLM.cpp
@@ -45,14 +45,19 @@ bool hasAssignedMemSpace(Value value) {
return false;
}

// Converts `memref::AllocOp` within GPU regions to the GPU shared local
// memory. Adjusts the allocation shape based on GPU block dimensions and
// creates a `memref::SubViewOp` for thread-specific memory access.
struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
using OpRewritePattern<memref::AllocOp>::OpRewritePattern;

ConvertAlloc(MLIRContext *ctx) : OpRewritePattern<memref::AllocOp>(ctx) {}

LogicalResult matchAndRewrite(memref::AllocOp allocOp,
PatternRewriter &rewriter) const override {
if (hasAssignedMemSpace(allocOp->getResult(0))) {
Value memref = allocOp->getResult(0);

if (hasAssignedMemSpace(memref)) {
return rewriter.notifyMatchFailure(
allocOp, "Memref already has some memory space attribute");
}
@@ -62,22 +67,83 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
"Only support allocs in GPU regions");
}

Value memref = allocOp->getResult(0);
auto launchOp = allocOp->getParentOfType<gpu::LaunchOp>();
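// The SLM buffer is sized statically from the work-group (block) dimensions,
// so those dimensions must be compile-time constants; dynamic block sizes are
// rejected below.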

auto xSz = dyn_cast<arith::ConstantIndexOp>(
launchOp.getBlockSizeX().getDefiningOp());
auto ySz = dyn_cast<arith::ConstantIndexOp>(
launchOp.getBlockSizeY().getDefiningOp());
auto zSz = dyn_cast<arith::ConstantIndexOp>(
launchOp.getBlockSizeZ().getDefiningOp());

if (!xSz || !ySz || !zSz)
return rewriter.notifyMatchFailure(
allocOp, "Only support constant block sizes for now");

int64_t xI = xSz.value();
int64_t yI = ySz.value();
int64_t zI = zSz.value();

if (zI != 1)
return rewriter.notifyMatchFailure(
allocOp, "Only support 2D shared memory for now");

MemRefType originalMemRefType = cast<MemRefType>(memref.getType());
auto originalShape = originalMemRefType.getShape();

// Scale the allocation size by the number of threads in the work-group
int64_t newX = originalShape[0] * xI;
int64_t newY = originalShape[1] * yI;

SmallVector<int64_t> newShape = {newX, newY};
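// Each thread is later given its own [originalShape[0] x originalShape[1]]
// tile of this scaled buffer, selected by its (thread_id.x, thread_id.y)
// through a subview.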

IntegerAttr sharedAddressSpace =
IntegerAttr::get(rewriter.getIntegerType(64),
static_cast<int64_t>(gpu::AddressSpace::Private));
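// Note: although the enum case is named `Private`, it is its numeric value
// that ends up as the memref memory space (space 3 in the tests below), which
// this pass uses as the shared local memory space.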

// Create a new MemRefType with the desired address space
MemRefType newMemRefType = MemRefType::get(
originalMemRefType.getShape(), originalMemRefType.getElementType(),
originalMemRefType.getLayout(), sharedAddressSpace);

Value newMemRef = rewriter.create<memref::AllocOp>(
allocOp.getLoc(), newMemRefType, allocOp.getOperands());

memref.replaceAllUsesWith(newMemRef);
MemRefType newRootMemRefType =
MemRefType::get(newShape, originalMemRefType.getElementType(),
originalMemRefType.getLayout(), sharedAddressSpace);

Value newRootMemRef =
rewriter
.create<memref::AllocOp>(allocOp.getLoc(), newRootMemRefType,
allocOp.getOperands())
.getResult();

// Compute the offsets into the SLM chunk for the current thread
auto origXConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
originalShape[0]);
auto origYConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
originalShape[1]);

auto threadIds = launchOp.getThreadIds();

auto offX =
rewriter
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.x, origXConst)
.getResult();
auto offY =
rewriter
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.y, origYConst)
.getResult();

auto offsets = getMixedValues({ShapedType::kDynamic, ShapedType::kDynamic},
{offX, offY}, rewriter);
auto sizes = getMixedValues(originalShape, {}, rewriter);
auto strides = getMixedValues({1, 1}, {}, rewriter);
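// The offsets are dynamic, thread-dependent values; the sizes match the
// original per-thread allocation shape; the strides are unit.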

auto newSlice =
rewriter
.create<memref::SubViewOp>(allocOp.getLoc(), newRootMemRef, offsets,
sizes, strides)
.getResult();
memref.replaceAllUsesWith(newSlice);

// Erase deallocs since we don't need them for SLM
for (auto user : newSlice.getUsers())
if (auto deallocOp = dyn_cast<memref::DeallocOp>(user))
deallocOp->erase();

return success();
}
41 changes: 27 additions & 14 deletions test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir
@@ -2,25 +2,38 @@

func.func @entry() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index

// Memory space wasn't assigned as the buffer is allocated outside of the gpu.launch block
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x16xf16>
%0 = memref.alloc() : memref<16x16xf16>
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c1, %sz_by = %c1, %sz_bz = %c1)
threads(%tx, %ty, %tz) in (%sz_tx = %c1, %sz_ty = %c1, %sz_tz = %c1) {
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x32xf16>
%0 = memref.alloc() : memref<16x32xf16>
// Capture thread-id variables
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[ARG5:.+]]) in
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c4, %[[ARG11:.+]] = %c1) {
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c4, %sz_tz = %c1) {
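// The SLM allocation below is scaled by the thread-block shape (2x4);
// the number of blocks (2x2) does not affect it.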
// Memory space was changed as it's explicitly specified
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x16xf16, 1>
%1 = memref.alloc() : memref<16x16xf16, 1>
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x32xf16, 1>
%1 = memref.alloc() : memref<16x32xf16, 1>
// Added 'shared' memory space
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<16x16xf16, 3>
%2 = memref.alloc() : memref<16x16xf16>
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<32x128xf16, 3>
// CHECK: %[[OFF_X:.*]] = arith.muli %[[THREAD_X]], %c16 : index
// CHECK: %[[OFF_Y:.*]] = arith.muli %[[THREAD_Y]], %c32 : index
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], %[[OFF_Y]]] [16, 32] [1, 1]
// CHECK-SAME: memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
%2 = memref.alloc() : memref<16x32xf16>
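// With a 2x4 thread block and a 16x32 per-thread allocation, the root SLM
// buffer becomes (16*2)x(32*4) = 32x128, and thread (tx, ty) is handed the
// tile at offset (tx*16, ty*32).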

// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_2]] : memref<16x16xf16, 1>, memref<16x16xf16, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x16xf16>)
linalg.add ins(%1, %2 :memref<16x16xf16, 1>, memref<16x16xf16>) outs(%0 : memref<16x16xf16>)
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x16xf16, 1>
// CHECK: memref.dealloc %[[NEW_MEMREF_2]] : memref<16x16xf16, 3>
memref.dealloc %1 : memref<16x16xf16, 1>
memref.dealloc %2 : memref<16x16xf16>
// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
// CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[128, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
linalg.add ins(%1, %2 :memref<16x32xf16, 1>, memref<16x32xf16>) outs(%0 : memref<16x32xf16>)
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x32xf16, 1>
// Verify that there are no deallocs for SLM
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
memref.dealloc %1 : memref<16x32xf16, 1>
memref.dealloc %2 : memref<16x32xf16>
gpu.terminator
}
return