diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d4f589..d379acf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,11 @@ project adheres to [Semantic Versioning](http://semver.org/).
 - Added `cu::DeviceMemory::memset()`
 - Added `cu::Stream::memsetAsync()`
 - Added `nvml::Device::getPower()`
+- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`,
+  and `cu::Stream::memcpyDtoD2DAsync()`
+- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()`
+- Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R`
+- Added `cu::Device::getOrdinal()`
 
 ### Changed
 
@@ -24,6 +29,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
 - Upgrade Catch2 to version v3.6.0
 - `target_embed_source` is now more robust: it properly tracks dependencies and
   runs again whenever any of them changes
+- Expanded tests to cover the new 2D memory operations and FFT support
 
 ## \[0.8.0\] - 2024-07-05
 
diff --git a/README.dev.md b/README.dev.md
index dcf02f1..f797b00 100644
--- a/README.dev.md
+++ b/README.dev.md
@@ -299,5 +299,5 @@ To view the generated documentation, open `_build/html/index.html` in your web-b
 
 ### Verification
 
-1. Make sure the new release is added to Zenodo (see <https://zenodo.org/record/6076447>)
+1. Make sure the new release is added to Zenodo (see <https://zenodo.org/records/8075251>)
 1. Activate the latest release documentation at <https://readthedocs.org/projects/cudawrappers/versions/>
diff --git a/cmake/cudawrappers-helper.cmake b/cmake/cudawrappers-helper.cmake
index 9ea5173..1bfdc0c 100644
--- a/cmake/cudawrappers-helper.cmake
+++ b/cmake/cudawrappers-helper.cmake
@@ -1,4 +1,4 @@
-# Retun a list of asbolute file names for all the local includes of the
+# Return a list of absolute file names for all the local includes of the
 # input_file.  Only files in the root directory will be considered.
 function(get_local_includes input_file root_dir)
   file(READ ${input_file} input_file_contents)
diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 032ab44..06cc69a 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -94,6 +94,23 @@ class Wrapper {
 
   explicit Wrapper(T &obj) : _obj(obj) {}
 
+  template <CUmemorytype... AllowedMemoryTypes>
+  inline void checkPointerAccess(const CUdeviceptr &pointer) const {
+    CUmemorytype memoryType;
+    checkCudaCall(cuPointerGetAttribute(
+        &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer));
+
+    // Check if the memoryType is one of the allowed memory types
+    for (auto allowedType : {AllowedMemoryTypes...}) {
+      if (memoryType == allowedType) {
+        return;
+      }
+    }
+
+    throw std::runtime_error(
+        "Invalid memory type: allowed types are not matched.");
+  }
+
   T _obj{};
   std::shared_ptr<T> manager;
 };
@@ -188,6 +205,8 @@ class Device : public Wrapper<CUdevice> {
     return size;
   }
 
+  int getOrdinal() const { return _ordinal; }
+
   // Primary Context Management
   std::pair<unsigned, bool> primaryCtxGetState() const {
     unsigned flags{};
@@ -597,6 +616,32 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
     checkCudaCall(cuMemsetD32(_obj, value, size));
   }
 
+  void memset2D(unsigned char value, size_t pitch, size_t width,
+                size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height));
+#endif
+  }
+
+  void memset2D(unsigned short value, size_t pitch, size_t width,
+                size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height));
+#endif
+  }
+
+  void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height));
+#endif
+  }
+
   void zero(size_t size) { memset(static_cast<unsigned char>(0), size); }
 
   const void *parameter()
@@ -607,15 +652,14 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
 
   template <typename T>
   operator T *() {
-    int data;
-    checkCudaCall(
-        cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
-    if (data) {
-      return reinterpret_cast<T *>(_obj);
-    } else {
-      throw std::runtime_error(
-          "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer.");
-    }
+    checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
+    return reinterpret_cast<T *>(_obj);
+  }
+
+  template <typename T>
+  operator T *() const {
+    checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
+    return reinterpret_cast<T const *>(_obj);
   }
 
   size_t size() const { return _size; }
@@ -670,6 +714,74 @@ class Stream : public Wrapper<CUstream> {
 #endif
   }
 
+  void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch,
+                         const void *hostPtr, size_t spitch, size_t width,
+                         size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemcpy2DAsync(devPtr, dpitch, hostPtr, spitch, width,
+                                   height, hipMemcpyHostToDevice, _obj));
+#else
+    // Initialize the CUDA_MEMCPY2D structure
+    CUDA_MEMCPY2D copyParams = {0};
+
+    // Set width and height for the 2D copy
+    copyParams.WidthInBytes = width;
+    copyParams.Height = height;
+
+    // Set the destination (dst)
+    copyParams.dstXInBytes = 0;
+    copyParams.dstY = 0;
+    copyParams.dstPitch = dpitch;
+
+    // Set the source (src)
+    copyParams.srcXInBytes = 0;
+    copyParams.srcY = 0;
+    copyParams.srcPitch = spitch;
+
+    copyParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+    copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copyParams.srcHost = hostPtr;
+    copyParams.dstDevice = devPtr;
+
+    // Call the driver API function cuMemcpy2DAsync
+    checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
+#endif
+  }
+
+  void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch,
+                         const DeviceMemory &devPtr, size_t spitch,
+                         size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemcpy2DAsync(hostPtr, dpitch, devPtr, spitch, width,
+                                   height, hipMemcpyDeviceToHost, _obj));
+#else
+    // Initialize the CUDA_MEMCPY2D structure
+    CUDA_MEMCPY2D copyParams = {0};
+
+    // Set width and height for the 2D copy
+    copyParams.WidthInBytes = width;
+    copyParams.Height = height;
+
+    // Set the destination (dst)
+    copyParams.dstXInBytes = 0;
+    copyParams.dstY = 0;
+    copyParams.dstPitch = dpitch;
+
+    // Set the source (src)
+    copyParams.srcXInBytes = 0;
+    copyParams.srcY = 0;
+    copyParams.srcPitch = spitch;
+
+    copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    copyParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+    copyParams.srcDevice = devPtr;
+    copyParams.dstHost = hostPtr;
+
+    // Call the driver API function cuMemcpy2DAsync
+    checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
+#endif
+  }
+
   void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) {
 #if defined(__HIP__)
     checkCudaCall(
@@ -716,10 +828,43 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj));
   }
 
+  void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
+  void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(
+        cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
+  void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(
+        cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
   void zero(DeviceMemory &devPtr, size_t size) {
     memsetAsync(devPtr, static_cast<unsigned char>(0), size);
   }
 
+  void zero2D(DeviceMemory &devPtr, size_t pitch, size_t width, size_t height) {
+    memset2DAsync(devPtr, static_cast<unsigned char>(0), pitch, width, height);
+  }
+
   void launchKernel(Function &function, unsigned gridX, unsigned gridY,
                     unsigned gridZ, unsigned blockX, unsigned blockY,
                     unsigned blockZ, unsigned sharedMemBytes,
@@ -777,4 +922,4 @@ inline void Event::record(Stream &stream) {
 }
 }  // namespace cu
 
-#endif
+#endif
\ No newline at end of file
diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp
index 7457fc5..57a08df 100644
--- a/include/cudawrappers/cufft.hpp
+++ b/include/cudawrappers/cufft.hpp
@@ -11,6 +11,7 @@
 #include <cufftXt.h>
 #endif
 
+#include <array>
 #include <exception>
 
 #include "cudawrappers/cu.hpp"
@@ -110,18 +111,19 @@ class FFT {
 
   ~FFT() { checkCuFFTCall(cufftDestroy(plan_)); }
 
-  void setStream(cu::Stream &stream) {
+  void setStream(cu::Stream &stream) const {
     checkCuFFTCall(cufftSetStream(plan_, stream));
   }
 
-  void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, int direction) {
+  void execute(cu::DeviceMemory &in, cu::DeviceMemory &out,
+               const int direction) const {
     void *in_ptr = reinterpret_cast<void *>(static_cast<CUdeviceptr>(in));
     void *out_ptr = reinterpret_cast<void *>(static_cast<CUdeviceptr>(out));
     checkCuFFTCall(cufftXtExec(plan_, in_ptr, out_ptr, direction));
   }
 
  protected:
-  void checkCuFFTCall(cufftResult result) {
+  void checkCuFFTCall(cufftResult result) const {
     if (result != CUFFT_SUCCESS) {
       throw Error(result);
     }
@@ -142,39 +144,39 @@ class FFT1D : public FFT {
 #if defined(__HIP__)
   __host__
 #endif
-  FFT1D(int nx) = delete;
+  FFT1D(const int nx) = delete;
 #if defined(__HIP__)
   __host__
 #endif
-  FFT1D(int nx, int batch) = delete;
+  FFT1D(const int nx, const int batch) = delete;
 };
 
 template <>
-FFT1D<CUDA_C_32F>::FFT1D(int nx, int batch) {
+FFT1D<CUDA_C_32F>::FFT1D(const int nx, const int batch) {
   checkCuFFTCall(cufftCreate(plan()));
   checkCuFFTCall(cufftPlan1d(plan(), nx, CUFFT_C2C, batch));
 }
 
 template <>
-FFT1D<CUDA_C_32F>::FFT1D(int nx) : FFT1D(nx, 1) {}
+FFT1D<CUDA_C_32F>::FFT1D(const int nx) : FFT1D(nx, 1) {}
 
 template <>
-FFT1D<CUDA_C_16F>::FFT1D(int nx, int batch) {
+FFT1D<CUDA_C_16F>::FFT1D(const int nx, const int batch) {
   checkCuFFTCall(cufftCreate(plan()));
   const int rank = 1;
   size_t ws = 0;
   std::array<long long, 1> n{nx};
-  long long int idist = 1;
-  long long int odist = 1;
-  int istride = 1;
-  int ostride = 1;
+  const long long idist = 1;
+  const long long odist = 1;
+  const int istride = 1;
+  const int ostride = 1;
   checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride,
                                      idist, CUDA_C_16F, nullptr, ostride, odist,
                                      CUDA_C_16F, batch, &ws, CUDA_C_16F));
 }
 
 template <>
-FFT1D<CUDA_C_16F>::FFT1D(int nx) : FFT1D(nx, 1) {}
+FFT1D<CUDA_C_16F>::FFT1D(const int nx) : FFT1D(nx, 1) {}
 
 /*
  * FFT2D
@@ -185,21 +187,23 @@ class FFT2D : public FFT {
 #if defined(__HIP__)
   __host__
 #endif
-  FFT2D(int nx, int ny) = delete;
+  FFT2D(const int nx, const int ny) = delete;
 #if defined(__HIP__)
   __host__
 #endif
-  FFT2D(int nx, int ny, int stride, int dist, int batch) = delete;
+  FFT2D(const int nx, const int ny, const int stride, const int dist,
+        const int batch) = delete;
 };
 
 template <>
-FFT2D<CUDA_C_32F>::FFT2D(int nx, int ny) {
+FFT2D<CUDA_C_32F>::FFT2D(const int nx, const int ny) {
   checkCuFFTCall(cufftCreate(plan()));
   checkCuFFTCall(cufftPlan2d(plan(), nx, ny, CUFFT_C2C));
 }
 
 template <>
-FFT2D<CUDA_C_32F>::FFT2D(int nx, int ny, int stride, int dist, int batch) {
+FFT2D<CUDA_C_32F>::FFT2D(const int nx, const int ny, const int stride,
+                         const int dist, const int batch) {
   checkCuFFTCall(cufftCreate(plan()));
   std::array<int, 2> n{nx, ny};
   checkCuFFTCall(cufftPlanMany(plan(), 2, n.data(), n.data(), stride, dist,
@@ -207,23 +211,102 @@ FFT2D<CUDA_C_32F>::FFT2D(int nx, int ny, int stride, int dist, int batch) {
 }
 
 template <>
-FFT2D<CUDA_C_16F>::FFT2D(int nx, int ny, int stride, int dist, int batch) {
+FFT2D<CUDA_C_16F>::FFT2D(const int nx, const int ny, const int stride,
+                         const int dist, const int batch) {
   checkCuFFTCall(cufftCreate(plan()));
   const int rank = 2;
   size_t ws = 0;
   std::array<long long, 2> n{nx, ny};
-  int istride = stride;
-  int ostride = stride;
-  long long int idist = dist;
-  long long int odist = dist;
+  const int istride = stride;
+  const int ostride = stride;
+  const long long int idist = dist;
+  const long long int odist = dist;
   checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride,
                                      idist, CUDA_C_16F, nullptr, ostride, odist,
                                      CUDA_C_16F, batch, &ws, CUDA_C_16F));
 }
 
 template <>
-FFT2D<CUDA_C_16F>::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {}
+FFT2D<CUDA_C_16F>::FFT2D(const int nx, const int ny)
+    : FFT2D(nx, ny, 1, nx * ny, 1) {}
+
+/*
+ * FFT1DR2C
+ */
+template <cudaDataType_t T>
+class FFT1DR2C : public FFT {
+ public:
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DR2C(const int nx) = delete;
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DR2C(const int nx, const int batch) = delete;
+
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DR2C(const int nx, const int batch, long long inembed,
+           long long ouembed) = delete;
+};
+
+template <>
+FFT1DR2C<CUDA_R_32F>::FFT1DR2C(const int nx, const int batch, long long inembed,
+                               long long ouembed) {
+  checkCuFFTCall(cufftCreate(plan()));
+  const int rank = 1;
+  size_t ws = 0;
+  std::array<long long, 1> n{nx};
+  const long long idist = inembed;
+  const long long odist = ouembed;
+  const long long istride = 1;
+  const long long ostride = 1;
+
+  checkCuFFTCall(cufftXtMakePlanMany(
+      *plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed,
+      ostride, odist, CUDA_C_32F, batch, &ws, CUDA_C_32F));
+}
+
+/*
+ * FFT1D_C2R
+ */
+template <cudaDataType_t T>
+class FFT1DC2R : public FFT {
+ public:
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DC2R(const int nx) = delete;
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DC2R(const int nx, const int batch) = delete;
+#if defined(__HIP__)
+  __host__
+#endif
+  FFT1DC2R(const int nx, const int batch, long long inembed,
+           long long ouembed) = delete;
+};
+
+template <>
+FFT1DC2R<CUDA_C_32F>::FFT1DC2R(const int nx, const int batch, long long inembed,
+                               long long ouembed) {
+  checkCuFFTCall(cufftCreate(plan()));
+  const int rank = 1;
+  size_t ws = 0;
+  std::array<long long, 1> n{nx};
+  const long long idist = inembed;
+  const long long odist = ouembed;
+  const int istride = 1;
+  const int ostride = 1;
+
+  checkCuFFTCall(cufftXtMakePlanMany(
+      *plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed,
+      ostride, odist, CUDA_R_32F, batch, &ws, CUDA_C_32F));
+}
 
 }  // namespace cufft
 
-#endif  // CUFFT_H
+#endif  // CUFFT_H
\ No newline at end of file
diff --git a/include/cudawrappers/nvtx.hpp b/include/cudawrappers/nvtx.hpp
index 5bb6f5f..31dac0d 100644
--- a/include/cudawrappers/nvtx.hpp
+++ b/include/cudawrappers/nvtx.hpp
@@ -1,7 +1,9 @@
 #if !defined NVTX_H
 #define NVTX_H
 
+#if !defined(__HIP__)
 #include <nvToolsExt.h>
+#endif
 
 namespace nvtx {
 
@@ -9,6 +11,10 @@ class Marker {
  public:
   enum Color { red, green, blue, yellow, black };
 
+#if defined(__HIP__)
+  explicit Marker(const char* message, unsigned color = Color::green) {}
+
+#else
   explicit Marker(const char* message, unsigned color = Color::green)
       : _attributes{0} {
     _attributes.version = NVTX_VERSION;
@@ -18,12 +24,21 @@ class Marker {
     _attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
     _attributes.message.ascii = message;
   }
+#endif
 
   Marker(const char* message, Color color) : Marker(message, convert(color)) {}
 
-  void start() { _id = nvtxRangeStartEx(&_attributes); }
+  void start() {
+#if !defined(__HIP__)
+    _id = nvtxRangeStartEx(&_attributes);
+#endif
+  }
 
-  void end() { nvtxRangeEnd(_id); }
+  void end() {
+#if !defined(__HIP__)
+    nvtxRangeEnd(_id);
+#endif
+  }
 
  private:
   unsigned int convert(Color color) {
@@ -43,8 +58,10 @@ class Marker {
     }
   }
 
+#if !defined(__HIP__)
   nvtxEventAttributes_t _attributes;
   nvtxRangeId_t _id;
+#endif
 };
 
 }  // end namespace nvtx
diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp
index e383b0f..744f80d 100644
--- a/tests/test_cu.cpp
+++ b/tests/test_cu.cpp
@@ -12,17 +12,29 @@ TEST_CASE("Test cu::Device", "[device]") {
   cu::Device device(0);
   cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
 
-  SECTION("Test Device.getName") {
+  SECTION("Test Device.getName", "[device]") {
     const std::string name = device.getName();
     std::cout << "Device name: " << name << std::endl;
     CHECK(name.size() > 0);
   }
 
-  SECTION("Test Device.getArch") {
+  SECTION("Test Device.getArch", "[device]") {
     const std::string arch = device.getArch();
     std::cout << "Device arch: " << arch << std::endl;
     CHECK(arch.size() > 0);
   }
+
+  SECTION("Test device.totalMem", "[device]") {
+    const size_t total_mem = device.totalMem();
+    std::cout << "Device total memory: " << (total_mem / (1024 * 1024))
+              << " bytes" << std::endl;
+    CHECK(total_mem > 0);
+  }
+
+  SECTION("Test Device.getOrdinal", "[device]") {
+    const int dev_ordinal = device.getOrdinal();
+    CHECK(dev_ordinal >= 0);
+  }
 }
 
 TEST_CASE("Test context::getDevice", "[device]") {
@@ -57,6 +69,24 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream",
     CHECK(src == tgt);
   }
 
+  SECTION("Test copying a 2D std::array to the device and back") {
+    const std::array<std::array<int, 3>, 3> src = {
+        {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}};
+    std::array<std::array<int, 3>, 3> tgt = {{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}};
+    const size_t width = 3 * sizeof(int);
+    const size_t height = 3;
+    const size_t pitch = width;
+
+    cu::DeviceMemory mem(pitch * height);
+
+    cu::Stream stream;
+    stream.memcpyHtoD2DAsync(mem, pitch, src.data(), pitch, width, height);
+    stream.memcpyDtoH2DAsync(tgt.data(), pitch, mem, pitch, width, height);
+    stream.synchronize();
+
+    CHECK(src == tgt);
+  }
+
   SECTION("Test copying HostMemory to the device and back") {
     const size_t N = 3;
     const size_t size = N * sizeof(int);
@@ -79,6 +109,34 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream",
 
     CHECK(!static_cast<bool>(memcmp(src, tgt, size)));
   }
+
+  SECTION("Test copying 2D HostMemory to the device and back") {
+    const size_t width = 3 * sizeof(int);
+    const size_t height = 3;
+    const size_t pitch = width;
+    const size_t size = pitch * height;
+    cu::HostMemory src(size);
+    cu::HostMemory tgt(size);
+
+    // Populate the 2D memory with values
+    int* const src_ptr = static_cast<int*>(src);
+    int* const tgt_ptr = static_cast<int*>(tgt);
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < 3; ++x) {
+        src_ptr[y * 3 + x] = y * 3 + x + 1;
+        tgt_ptr[y * 3 + x] = 0;
+      }
+    }
+
+    cu::DeviceMemory mem(size);
+    cu::Stream stream;
+
+    stream.memcpyHtoD2DAsync(mem, pitch, src, pitch, width, height);
+    stream.memcpyDtoH2DAsync(tgt, pitch, mem, pitch, width, height);
+    stream.synchronize();
+
+    CHECK(static_cast<bool>(memcmp(src, tgt, size)) == 0);
+  }
 }
 
 TEST_CASE("Test cu::DeviceMemory", "[devicememory]") {
@@ -163,7 +221,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") {
   SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") {
     cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0);
     float* ptr;
-    CHECK_THROWS(ptr = mem);
+    CHECK_NOTHROW(ptr = mem);
   }
 
   SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") {
@@ -206,7 +264,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") {
 }
 
 using TestTypes = std::tuple<unsigned char, unsigned short, unsigned int>;
-TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) {
+TEMPLATE_LIST_TEST_CASE("Test memset 1D", "[memset]", TestTypes) {
   cu::init();
   cu::Device device(0);
   cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
@@ -263,6 +321,71 @@ TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) {
   }
 }
 
+using TestTypes = std::tuple<unsigned char, unsigned short, unsigned int>;
+TEMPLATE_LIST_TEST_CASE("Test memset 2D", "[memset]", TestTypes) {
+  cu::init();
+  cu::Device device(0);
+  cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
+
+  SECTION("Test memset2D cu::DeviceMemory asynchronously") {
+    const size_t width = 3;
+    const size_t height = 3;
+    const size_t pitch = width * sizeof(TestType);
+    const size_t size = pitch * height;
+    cu::HostMemory a(size);
+    cu::HostMemory b(size);
+    TestType value = 0xAA;
+
+    // Populate the memory with initial values
+    TestType* const a_ptr = static_cast<TestType*>(a);
+    TestType* const b_ptr = static_cast<TestType*>(b);
+    for (int i = 0; i < width * height; i++) {
+      a_ptr[i] = 0;
+      b_ptr[i] = value;
+    }
+
+    cu::DeviceMemory mem(size);
+    cu::Stream stream;
+
+    // Perform the 2D memory operations
+    stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height);
+    stream.memset2DAsync(mem, value, pitch, width, height);
+    stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height);
+
+    CHECK(static_cast<bool>(memcmp(a, b, size)));
+  }
+
+  SECTION("Test zeroing cu::DeviceMemory synchronously in 2D") {
+    const size_t width = 3;
+    const size_t height = 3;
+    const size_t pitch = width * sizeof(TestType);
+    const size_t size = pitch * height;
+    cu::HostMemory a(size);
+    cu::HostMemory b(size);
+    TestType value = 0xAA;
+
+    // Populate the memory with initial values
+    TestType* const a_ptr = static_cast<TestType*>(a);
+    TestType* const b_ptr = static_cast<TestType*>(b);
+    for (int i = 0; i < width * height; i++) {
+      a_ptr[i] = 0;
+      b_ptr[i] = value;
+    }
+
+    cu::DeviceMemory mem(size);
+    cu::Stream stream;
+
+    // Perform the 2D memory operations
+    stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height);
+    stream.synchronize();
+    mem.memset2D(value, pitch, width, height);
+    stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height);
+    stream.synchronize();
+
+    CHECK(static_cast<bool>(memcmp(a, b, size)));
+  }
+}
+
 TEST_CASE("Test cu::Stream", "[stream]") {
   cu::init();
   cu::Device device(0);
diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp
index 251091d..afc5354 100644
--- a/tests/test_cufft.cpp
+++ b/tests/test_cufft.cpp
@@ -114,6 +114,34 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") {
     scaleSignal(out_ptr, out_ptr, size, float(size));
     compare(out_ptr, in_ptr, size);
   }
+
+  SECTION("FP32 FFT with Real-To-Complex translation, and back") {
+    const size_t arraySize = size * sizeof(cufftComplex);
+
+    cu::HostMemory h_in(arraySize);
+    cu::HostMemory h_out(arraySize);
+    cu::DeviceMemory d_in(arraySize);
+    cu::DeviceMemory d_out(arraySize);
+    cu::DeviceMemory d_out2(arraySize);
+
+    generateSignal(static_cast<cufftComplex *>(h_in), size, patchSize, {1, 1});
+    stream.memcpyHtoDAsync(d_in, h_in, arraySize);
+
+    cufft::FFT1DR2C<CUDA_R_32F> fft_r2c(size, 1, 1, 1);
+    cufft::FFT1DC2R<CUDA_C_32F> fft_c2r(size, 1, 1, 1);
+    fft_r2c.setStream(stream);
+    fft_c2r.setStream(stream);
+
+    fft_r2c.execute(d_in, d_out, CUFFT_FORWARD);
+    fft_c2r.execute(d_out, d_out2, CUFFT_INVERSE);
+    stream.memcpyDtoHAsync(h_out, d_out2, arraySize);
+    stream.synchronize();
+
+    cuFloatComplex *in_ptr = static_cast<cuFloatComplex *>(h_in);
+    cuFloatComplex *out_ptr = static_cast<cuFloatComplex *>(h_out);
+    scaleSignal(out_ptr, out_ptr, size, float(size));
+    compare(out_ptr, in_ptr, size);
+  }
 }
 
 TEST_CASE("Test 2D FFT", "[FFT2D]") {