diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
index 1bad1a9c7..f48d9b8c1 100644
--- a/lib/Transforms/InsertGPUAllocs.cpp
+++ b/lib/Transforms/InsertGPUAllocs.cpp
@@ -74,6 +74,32 @@ class InsertGPUAllocsPass final
     return mlir::success();
   }
 
+  // Recursively walk all users of 'memory' and replace every
+  // memref.copy with a gpu.memcpy.
+  static void replaceMemrefCopyWithGpuMemcpy(mlir::OpBuilder &builder,
+                                             mlir::Value memory) {
+    mlir::SmallVector<mlir::memref::CopyOp> toErase;
+
+    for (auto u : memory.getUsers()) {
+      if (auto copyOp = mlir::dyn_cast<mlir::memref::CopyOp>(u)) {
+        builder.setInsertionPoint(copyOp);
+        builder.create<mlir::gpu::MemcpyOp>(
+            copyOp.getLoc(), /*resultTypes=*/mlir::TypeRange(),
+            /*asyncDeps=*/mlir::ValueRange(), /*dst=*/copyOp.getTarget(),
+            /*src=*/copyOp.getSource());
+        toErase.push_back(copyOp);
+      } else if (u->getNumResults() == 0)
+        continue;
+      else {
+        for (auto result : u->getResults())
+          replaceMemrefCopyWithGpuMemcpy(builder, result);
+      }
+    }
+
+    for (auto copyOp : toErase)
+      copyOp.erase();
+  }
+
   void runOnOperation() override {
     auto func = getOperation();
     auto &funcBody = func.getBody();
@@ -382,12 +408,14 @@ class InsertGPUAllocsPass final
      }

      if (auto copy = mlir::dyn_cast<mlir::memref::CopyOp>(user)) {
-        if (copy.getSource() == mem)
-          ret.hostRead = true;
-
-        if (copy.getTarget() == mem)
-          ret.hostWrite = true;
-
+        // All memref.copy ops should have been replaced with gpu.memcpy by
+        // the 'add_gpu_alloc' function, so we don't count them as host
+        // usage. If a case turns up where that doesn't hold, we may need to
+        // uncomment this code and do some analysis here instead.
+        // if (copy.getSource() == mem)
+        //   ret.hostRead = true;
+        // if (copy.getTarget() == mem)
+        //   ret.hostWrite = true;
        continue;
      }
@@ -433,23 +461,29 @@ class InsertGPUAllocsPass final
            /*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
            alloc.getSymbolOperands(), hostShared);
        auto allocResult = gpuAlloc.getResult(0);
+
+        auto memory = alloc->getResult(0);
+        replaceMemrefCopyWithGpuMemcpy(builder, memory);
+
        builder.setInsertionPoint(term);
        for (mlir::OpOperand &use : alloc.getResult().getUses()) {
          if (use.getOwner() == term) {
            auto newAlloc = builder.create<mlir::memref::AllocOp>(
                loc, alloc.getType(), alloc.getDynamicSizes(),
                alloc.getSymbolOperands());
-            builder.create<mlir::memref::CopyOp>(loc, allocResult,
-                                                 newAlloc.getResult());
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/newAlloc.getResult(),
+                /*src=*/allocResult);
            use.set(newAlloc.getResult());
          }
        }

        // remove 'memref.dealloc' (it's later replaced with gpu.dealloc)
-        auto memory = alloc->getResult(0);
        for (auto u : memory.getUsers()) {
          if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(u)) {
            dealloc.erase();
+            break;
          }
        }

@@ -469,6 +503,8 @@ class InsertGPUAllocsPass final
      filter.clear();
      dims.clear();

+      replaceMemrefCopyWithGpuMemcpy(builder, op);
+
      // This code handles dynamic dims with known rank.
      for (auto i : llvm::seq(0u, rank)) {
        if (memrefType.isDynamicDim(i)) {
@@ -489,8 +525,9 @@ class InsertGPUAllocsPass final
            /*symbolOperands*/ std::nullopt, hostShared);
        auto allocResult = gpuAlloc.getResult(0);
        if (access.hostWrite && access.deviceRead) {
-          auto copy =
-              builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
+          auto copy = builder.create<mlir::gpu::MemcpyOp>(
+              loc, /*resultTypes=*/mlir::TypeRange(),
+              /*asyncDeps=*/mlir::ValueRange(), allocResult, op);
          filter.insert(copy);
        }

@@ -501,7 +538,10 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(castedAllocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/castedAllocResult);
          }
          builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt,
                                               castedAllocResult);
@@ -509,7 +549,10 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(allocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/allocResult);
          }
          builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
        }
@@ -518,8 +561,10 @@ class InsertGPUAllocsPass final
            builder.create<mlir::memref::AllocOp>(loc, allocType, dims);
        auto allocResult = gpuAlloc.getResult();
        if (access.hostWrite && access.deviceRead) {
-          auto copy =
-              builder.create<mlir::memref::CopyOp>(loc, op, allocResult);
+          auto copy = builder.create<mlir::gpu::MemcpyOp>(
+              loc, /*resultTypes=*/mlir::TypeRange(),
+              /*asyncDeps=*/mlir::ValueRange(), /*dst=*/allocResult,
+              /*src=*/op);
          filter.insert(copy);
        }

@@ -530,13 +575,19 @@ class InsertGPUAllocsPass final
          op.replaceAllUsesExcept(castedAllocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/castedAllocResult);
          }
        } else {
          op.replaceAllUsesExcept(allocResult, filter);
          builder.setInsertionPoint(term);
          if (access.hostRead && access.deviceWrite) {
-            builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
+            builder.create<mlir::gpu::MemcpyOp>(
+                loc, /*resultTypes=*/mlir::TypeRange(),
+                /*asyncDeps=*/mlir::ValueRange(), /*dst=*/op,
+                /*src=*/allocResult);
          }
        }
      }
diff --git a/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir b/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
index 69be7113f..9670afc08 100644
--- a/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
+++ b/test/Transforms/InsertGpuAllocs/add-gpu-alloc-for-tmp-buff.mlir
@@ -12,15 +12,15 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
   %c5 = arith.constant 5 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], %[[arg1]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg0]], %[[MEMREF2]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF2]], %[[arg0]] : memref<2x5xf32>, memref<2x5xf32>
   %tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL-NOT: %[[MEMREF3:.*]] = memref.alloc().*
@@ -49,7 +49,7 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memre
   // OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF2]] : memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF1]] : memref<2x5xf32>
-  // OPENCL: memref.copy %[[MEMREF0]], %[[out_buff]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[out_buff]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: gpu.dealloc %[[MEMREF0]] : memref<2x5xf32>
   // VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
   memref.dealloc %tmp_buff : memref<2x5xf32>
diff --git a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
index be1f61706..1a5df8963 100644
--- a/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
+++ b/test/Transforms/InsertGpuAllocs/add-gpu-alloc.mlir
@@ -12,13 +12,13 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %gpu_arg0: memre
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %arg1, %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF0]], %arg1 : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %arg0, %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], %arg0 : memref<2x5xf32>, memref<2x5xf32>
   %0 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
diff --git a/test/Transforms/InsertGpuAllocs/memref-get-global.mlir b/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
index 2d94ebfee..ad49059e4 100644
--- a/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
+++ b/test/Transforms/InsertGpuAllocs/memref-get-global.mlir
@@ -16,18 +16,18 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
   %2 = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
   // OPENCL: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
-  // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: %[[MEMREF0:.*]] = gpu.alloc () : memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
-  // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: %[[MEMREF1:.*]] = gpu.alloc () : memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
   // OPENCL: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // VULKAN: [[VAR0:%.*]] = memref.get_global @__constant_2x5xf32 : memref<2x5xf32>
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy [[VAR0]], %[[MEMREF0]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF0]], [[VAR0]] : memref<2x5xf32>, memref<2x5xf32>
   // VULKAN: [[VAR1:%.*]] = memref.get_global @__constant_2x5xf32_0 : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy [[VAR1]], %[[MEMREF1]] : memref<2x5xf32> to memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[MEMREF1]], [[VAR1]] : memref<2x5xf32>, memref<2x5xf32>
   %c1_0 = arith.constant 1 : index
   %3 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
diff --git a/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir b/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
index 62af82f4f..3d0e3d50e 100644
--- a/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
+++ b/test/Transforms/InsertGpuAllocs/memref-returned-from-call.mlir
@@ -12,7 +12,7 @@ func.func @main() {
   // OPENCL: func.func @main()
   %0 = func.call @alloc_buffer() : () -> memref<8xf32>
   // OPENCL: %[[MEMREF:.*]] = gpu.alloc host_shared () : memref<8xf32>
-  // OPENCL: memref.copy %0, %[[MEMREF]] : memref<8xf32> to memref<8xf32>
+  // OPENCL: gpu.memcpy %[[MEMREF]], %0 : memref<8xf32>, memref<8xf32>
   %1 = memref.alloc() : memref<8xf32>
   %2 = memref.alloc() : memref<8xf32>
   gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
diff --git a/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir b/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
index cba5c67c5..a62e03294 100644
--- a/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
+++ b/test/Transforms/InsertGpuAllocs/xegpu-mem-copy.mlir
@@ -9,10 +9,10 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
   %c1 = arith.constant 1 : index
   // OPENCL: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
   // OPENCL: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
-  // OPENCL: memref.copy %[[arg0]], %[[MEMREF1:.*]]
+  // OPENCL: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
   // VULKAN: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
   // VULKAN: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
-  // VULKAN: memref.copy %[[arg0]], %[[MEMREF1:.*]]
+  // VULKAN: gpu.memcpy %[[MEMREF1:.*]], %[[arg0]]
   gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1, %arg12 = %c1, %arg13 = %c1) {
     %c0 = arith.constant 0 : index
@@ -23,8 +23,8 @@ func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>) -> memref<2x5xf3
     gpu.terminator
   } {SCFToGPU_visited}

-  // OPENCL: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
-  // VULKAN: memref.copy %[[MEMREF0]], %[[arg1]] : memref<2x5xf32> to memref<2x5xf32>
+  // OPENCL: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
+  // VULKAN: gpu.memcpy %[[arg1]], %[[MEMREF0]] : memref<2x5xf32>, memref<2x5xf32>
   return %arg1 : memref<2x5xf32>
 }
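Illustrative sketch (not part of the patch; %m and %arg0 are hypothetical names):
the two ops use opposite operand orders, which is why every rewritten CHECK line
above swaps the operands. memref.copy takes (source, target), while gpu.memcpy
takes (target, source). For a host-to-device staging copy:

  // Before this patch, InsertGPUAllocs emitted:
  %m = gpu.alloc host_shared () : memref<2x5xf32>
  memref.copy %arg0, %m : memref<2x5xf32> to memref<2x5xf32>

  // After this patch, the same transfer is expressed in the gpu dialect:
  %m = gpu.alloc host_shared () : memref<2x5xf32>
  gpu.memcpy %m, %arg0 : memref<2x5xf32>, memref<2x5xf32>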