Skip to content

Commit

Permalink
GPU memcpy draft
Browse files Browse the repository at this point in the history
Signed-off-by: dchigarev <[email protected]>
  • Loading branch information
dchigarev committed Dec 13, 2024
1 parent 08b8571 commit 7b23d43
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 37 deletions.
85 changes: 68 additions & 17 deletions lib/Transforms/InsertGPUAllocs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ class InsertGPUAllocsPass final
return mlir::success();
}

// Walk all users of 'memory' (transitively, through the results of
// intermediate ops such as casts/subviews) and replace every
// memref.copy with an equivalent gpu.memcpy.
//
// Uses an explicit worklist plus a visited set instead of recursion:
// an op can be reachable through more than one traversed value (e.g. a
// memref.copy whose source AND target both derive from 'memory'). A
// naive recursive walk would visit such a copy twice, emitting a
// duplicate gpu.memcpy and erasing the same op twice (use-after-free);
// the visited set guarantees each user is processed exactly once.
static void replaceMemrefCopyWithGpuMemcpy(mlir::OpBuilder &builder,
                                           mlir::Value memory) {
  mlir::SmallVector<mlir::Value> worklist{memory};
  mlir::SmallPtrSet<mlir::Operation *, 8> visited;
  // Erase only after the traversal so we never invalidate a user list
  // we are still iterating over.
  mlir::SmallVector<mlir::memref::CopyOp> toErase;

  while (!worklist.empty()) {
    mlir::Value value = worklist.pop_back_val();
    for (mlir::Operation *user : value.getUsers()) {
      if (!visited.insert(user).second)
        continue; // already handled through another value
      if (auto copyOp = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
        // Emit the gpu.memcpy right where the memref.copy was, with the
        // same source/target operands (gpu.memcpy takes dst first).
        builder.setInsertionPoint(copyOp);
        builder.create<mlir::gpu::MemcpyOp>(
            copyOp.getLoc(), /*resultTypes=*/mlir::TypeRange(),
            /*asyncDeps=*/mlir::ValueRange(), /*dst=*/copyOp.getTarget(),
            /*src=*/copyOp.getSource());
        toErase.push_back(copyOp);
      } else {
        // Follow derived values so copies of views/casts of 'memory'
        // are rewritten as well.
        for (mlir::Value result : user->getResults())
          worklist.push_back(result);
      }
    }
  }

  for (mlir::memref::CopyOp copyOp : toErase)
    copyOp.erase();
}

void runOnOperation() override {
auto func = getOperation();
auto &funcBody = func.getBody();
Expand Down Expand Up @@ -382,12 +408,14 @@ class InsertGPUAllocsPass final
}

if (auto copy = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
if (copy.getSource() == mem)
ret.hostRead = true;

if (copy.getTarget() == mem)
ret.hostWrite = true;

// All memref.copy ops should be replaced with gpu.memcpy by
// 'add_gpu_alloc' function, so we shouldn't count them as "host
// usage". We may need to uncomment this code and do some analysis
// here instead if a case where it doesn't work would be found. if
// (copy.getSource() == mem)
// ret.hostRead = true;
// if (copy.getTarget() == mem)
// ret.hostWrite = true;
continue;
}

Expand Down Expand Up @@ -433,23 +461,29 @@ class InsertGPUAllocsPass final
/*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
alloc.getSymbolOperands(), hostShared);
auto allocResult = gpuAlloc.getResult(0);

auto memory = alloc->getResult(0);
replaceMemrefCopyWithGpuMemcpy(builder, memory);

builder.setInsertionPoint(term);
for (mlir::OpOperand &use : alloc.getResult().getUses()) {
if (use.getOwner() == term) {
auto newAlloc = builder.create<mlir::memref::AllocOp>(
loc, alloc.getType(), alloc.getDynamicSizes(),
alloc.getSymbolOperands());
builder.create<mlir::memref::CopyOp>(loc, allocResult,
newAlloc.getResult());
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/newAlloc.getResult(),
/*src=*/allocResult);
use.set(newAlloc.getResult());
}
}

// remove 'memref.dealloc' (it's later replaced with gpu.dealloc)
auto memory = alloc->getResult(0);
for (auto u : memory.getUsers()) {
if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(u)) {
dealloc.erase();
break;
}
}

Expand All @@ -469,6 +503,8 @@ class InsertGPUAllocsPass final
filter.clear();
dims.clear();

replaceMemrefCopyWithGpuMemcpy(builder, op);

// This code handles dynamic dims with known rank.
for (auto i : llvm::seq(0u, rank)) {
if (memrefType.isDynamicDim(i)) {
Expand All @@ -489,8 +525,9 @@ class InsertGPUAllocsPass final
/*symbolOperands*/ std::nullopt, hostShared);
auto allocResult = gpuAlloc.getResult(0);
if (access.hostWrite && access.deviceRead) {
auto copy =
builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
auto copy = builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), allocResult, op);
filter.insert(copy);
}

Expand All @@ -501,15 +538,21 @@ class InsertGPUAllocsPass final
op.replaceAllUsesExcept(castedAllocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/castedAllocResult);
}
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt,
castedAllocResult);
} else {
op.replaceAllUsesExcept(allocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/allocResult);
}
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
}
Expand All @@ -518,8 +561,10 @@ class InsertGPUAllocsPass final
builder.create<mlir::memref::AllocOp>(loc, allocType, dims);
auto allocResult = gpuAlloc.getResult();
if (access.hostWrite && access.deviceRead) {
auto copy =
builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
auto copy = builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/allocResult,
/*src=*/op);
filter.insert(copy);
}

Expand All @@ -530,13 +575,19 @@ class InsertGPUAllocsPass final
op.replaceAllUsesExcept(castedAllocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/castedAllocResult);
}
} else {
op.replaceAllUsesExcept(allocResult, filter);
builder.setInsertionPoint(term);
if (access.hostRead && access.deviceWrite) {
builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
builder.create<mlir::gpu::MemcpyOp>(
loc, /*resultTypes=*/mlir::TypeRange(),
/*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
/*src=*/allocResult);
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
%c5 = arith.constant 5 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>

// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>

%tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
// OPENCL-NOT: %[[MEMREF3:.*]] = memref.alloc().*
Expand Down Expand Up @@ -49,7 +49,7 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
// OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF2]] : memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF1]] : memref<2x5xf32>
// OPENCL: memref.copy %[[MEMREF0]], %[[out_buff]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[out_buff]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: gpu.dealloc %[[MEMREF0]] : memref<2x5xf32>
// VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
memref.dealloc %tmp_buff : memref<2x5xf32>
Expand Down
8 changes: 4 additions & 4 deletions test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %gpu_arg0: memre
%c1 = arith.constant 1 : index
%c5 = arith.constant 5 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>

%0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
Expand Down
12 changes: 6 additions & 6 deletions test/Transforms/InsertGpuAllocs/memref-get-global.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
%2 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>

// OPENCL: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc () : memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc () : memref<2x5xf32>
// OPENCL: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
// OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// VULKAN: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>

%c1_0 = arith.constant 1 : index
%3 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func.func @main() {
// OPENCL: func.func @main()
%0 = func.call @alloc_buffer() : () -> memref<8xf32>
// OPENCL: %[[MEMREF:.*]] = gpu.alloc host_shared () : memref<8xf32>
// OPENCL: memref.copy %0, %[[MEMREF]] : memref<8xf32> to memref<8xf32>
// OPENCL: gpu.memcpy %[[MEMREF]], %0 : memref<8xf32>, memref<8xf32>
%1 = memref.alloc() : memref<8xf32>
%2 = memref.alloc() : memref<8xf32>
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
Expand Down
8 changes: 4 additions & 4 deletions test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
%c1 = arith.constant 1 : index
// OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
// OPENCL: memref.copy %[[arg0]], %[[MEMREF1:.*]]
// OPENCL: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
// VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
// VULKAN: memref.copy %[[arg0]], %[[MEMREF1:.*]]
// VULKAN: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]

gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1, %arg12 = %c1, %arg13 = %c1) {
%c0 = arith.constant 0 : index
Expand All @@ -23,8 +23,8 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
gpu.terminator
} {SCFToGPU_visited}

// OPENCL: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
// VULKAN: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
// OPENCL: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
// VULKAN: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>

return %arg1 : memref<2x5xf32>
}

0 comments on commit 7b23d43

Please sign in to comment.