diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
index 1bad1a9c7..f48d9b8c1 100644
--- a/lib/Transforms/InsertGPUAllocs.cpp
+++ b/lib/Transforms/InsertGPUAllocs.cpp
@@ -74,6 +74,32 @@ class InsertGPUAllocsPass final
     return mlir::success();
   }
 
+  // Recursively walk all users of 'memory' and replace every
+  // memref.copy with a gpu.memcpy.
+  static void replaceMemrefCopyWithGpuMemcpy(mlir::OpBuilder &builder,
+                                             mlir::Value memory) {
+    mlir::SmallVector<mlir::memref::CopyOp> toErase;
+
+    for (auto u : memory.getUsers()) {
+      if (auto copyOp = mlir::dyn_cast<mlir::memref::CopyOp>(u)) {
+        builder.setInsertionPoint(copyOp);
+        builder.create<mlir::gpu::MemcpyOp>(
+            copyOp.getLoc(), /*resultTypes=*/mlir::TypeRange(),
+            /*asyncDeps=*/mlir::ValueRange(), /*dst=*/copyOp.getTarget(),
+            /*src=*/copyOp.getSource());
+        toErase.push_back(copyOp);
+      } else if (u->getNumResults() == 0)
+        continue;
+      else {
+        for (auto result : u->getResults())
+          replaceMemrefCopyWithGpuMemcpy(builder, result);
+      }
+    }
+
+    for (auto copyOp : toErase)
+      copyOp.erase();
+  }
+
   void runOnOperation() override {
     auto func = getOperation();
     auto &funcBody = func.getBody();
@@ -382,12 +408,14 @@ class InsertGPUAllocsPass final
      }

      if (auto copy = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
-        if (copy.getSource() == mem)
-          ret.hostRead = true;
-
-        if (copy.getTarget() == mem)
-          ret.hostWrite = true;
-
+        // All memref.copy ops should have been replaced with gpu.memcpy by
+        // the 'add_gpu_alloc' function, so we don't count them as host
+        // usage. If a case turns up where that doesn't hold, we may need to
+        // uncomment this code and do some analysis here instead.
+        // if (copy.getSource() == mem)
+        //   ret.hostRead = true;
+        // if (copy.getTarget() == mem)
+        //   ret.hostWrite = true;
        continue;
      }
@@ -433,23 +461,29 @@ class InsertGPUAllocsPass final
            /*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
            alloc.getSymbolOperands(), hostShared);
        auto allocResult = gpuAlloc.getResult(0);
+
+        auto memory = alloc->getResult(0);
+        replaceMemrefCopyWithGpuMemcpy(builder, memory);
+
        builder.setInsertionPoint(term);
        for (mlir::OpOperand &use : alloc.getResult().getUses()) {
          if (use.getOwner() == term) {
            auto newAlloc = builder.create<mlir::memref::AllocOp>(
                loc, alloc.getType(), alloc.getDynamicSizes(),
                alloc.getSymbolOperands());
-            builder.create<mlir::memref::CopyOp>(loc, allocResult,
-                                                 newAlloc.getResult());
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/newAlloc.getResult(),
+                /*src=*/allocResult);
            use.set(newAlloc.getResult());
          }
        }

        // remove 'memref.dealloc' (it's later replaced with gpu.dealloc)
-        auto memory = alloc->getResult(0);
        for (auto u : memory.getUsers()) {
          if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(u)) {
            dealloc.erase();
+            break;
          }
        }

@@ -469,6 +503,8 @@ class InsertGPUAllocsPass final
      filter.clear();
      dims.clear();

+      replaceMemrefCopyWithGpuMemcpy(builder, op);
+
      // This code handles dynamic dims with known rank.
      for (auto i : llvm::seq(0u, rank)) {
        if (memrefType.isDynamicDim(i)) {
@@ -489,8 +525,9 @@ class InsertGPUAllocsPass final
            /*symbolOperands*/ std::nullopt, hostShared);
        auto allocResult = gpuAlloc.getResult(0);
        if (access.hostWrite && access.deviceRead) {
-          auto copy =
-              builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
+          auto copy = builder.create<mlir::gpu::MemcpyOp>(
+              loc, /*resultTypes=*/mlir::TypeRange(),
+              /*asyncDeps=*/mlir::ValueRange(), allocResult, op);
          filter.insert(copy);
        }

@@ -501,7 +538,10 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(castedAllocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/castedAllocResult);
          }
          builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt,
                                               castedAllocResult);
@@ -509,7 +549,10 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(allocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/allocResult);
          }
          builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
        }
@@ -518,8 +561,10 @@ class InsertGPUAllocsPass final
            builder.create<mlir::memref::AllocOp>(loc, allocType, dims);
        auto allocResult = gpuAlloc.getResult();
        if (access.hostWrite && access.deviceRead) {
-          auto copy =
-              builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
+          auto copy = builder.create<mlir::gpu::MemcpyOp>(
+              loc, /*resultTypes=*/mlir::TypeRange(),
+              /*asyncDeps=*/mlir::ValueRange(), /*dst=*/allocResult,
+              /*src=*/op);
          filter.insert(copy);
        }

@@ -530,13 +575,19 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(castedAllocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/castedAllocResult);
          }
        } else {
          op.replaceAllUsesExcept(allocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/allocResult);
          }
        }
      }
diff --git a/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir b/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
index 69be7113f..9670afc08 100644
--- a/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
+++ b/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
@@ -12,15 +12,15 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
   %c5 = arith.constant 5 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>
   %tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL-NOT: %[[MEMREF3:.*]] = memref.alloc().*
@@ -49,7 +49,7 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
   // OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF2]] : memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF1]] : memref<2x5xf32>
-  // OPENCL: memref.copy %[[MEMREF0]], %[[out_buff]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[out_buff]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF0]] : memref<2x5xf32>
   // VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
   memref.dealloc %tmp_buff : memref<2x5xf32>
diff --git a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
index be1f61706..1a5df8963 100644
--- a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
+++ b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
@@ -12,13 +12,13 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %gpu_arg0: memre
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
   %0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
diff --git a/test/Transforms/InsertGpuAllocs/memref-get-global.mlir b/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
index 2d94ebfee..ad49059e4 100644
--- a/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
+++ b/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
@@ -16,18 +16,18 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
   %2 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
-  // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: %[[MEMREF0:.*]] = gpu.alloc () : memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
-  // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: %[[MEMREF1:.*]] = gpu.alloc () : memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // VULKAN: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
   %c1_0 = arith.constant 1 : index
   %3 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
diff --git a/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir b/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
index 62af82f4f..3d0e3d50e 100644
--- a/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
+++ b/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
@@ -12,7 +12,7 @@ func.func @main() {
   // OPENCL: func.func @main()
   %0 = func.call @alloc_buffer() : () -> memref<8xf32>
   // OPENCL: %[[MEMREF:.*]] = gpu.alloc host_shared () : memref<8xf32>
-  // OPENCL: memref.copy %0, %[[MEMREF]] : memref<8xf32> to memref<8xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF]], %0 : memref<8xf32>, memref<8xf32>
   %1 = memref.alloc() : memref<8xf32>
   %2 = memref.alloc() : memref<8xf32>
   gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
diff --git a/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir b/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
index cba5c67c5..a62e03294 100644
--- a/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
+++ b/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
@@ -9,10 +9,10 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
   %c1 = arith.constant 1 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg0]], %[[MEMREF1:.*]]
+  // OPENCL: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg0]], %[[MEMREF1:.*]]
+  // VULKAN: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
   gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1, %arg12 = %c1, %arg13 = %c1) {
     %c0 = arith.constant 0 : index
@@ -23,8 +23,8 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
     gpu.terminator
   } {SCFToGPU_visited}

-  // OPENCL: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
-  // VULKAN: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
   return %arg1 : memref<2x5xf32>
 }
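Illustrative sketch (not part of the patch; %m and %arg0 are hypothetical names):
the two ops use opposite operand orders, which is why every rewritten CHECK line
above swaps the operands. memref.copy takes (source, target), while gpu.memcpy
takes (target, source). For a host-to-device staging copy:

  // Before this patch, InsertGPUAllocs emitted:
  %m = gpu.alloc host_shared () : memref<2x5xf32>
  memref.copy %arg0, %m : memref<2x5xf32> to memref<2x5xf32>

  // After this patch, the same transfer is expressed in the gpu dialect:
  %m = gpu.alloc host_shared () : memref<2x5xf32>
  gpu.memcpy %m, %arg0 : memref<2x5xf32>, memref<2x5xf32>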