Added support for 2D operations and C2R/R2C FFT (#305)

* Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, and `cu::Stream::memcpyDtoD2DAsync()` * Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` * Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R` * Added `cu::Device::getOrdinal()` * Allow non-managed memory dereferencing in `cu::DeviceMemory` --------- Co-authored-by: Bram Veenboer <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
nlesc-recruit · Nov 7, 2024 · fd7f102 · fd7f102
1 parent a7f1c0a
commit fd7f102
Show file tree

Hide file tree

Showing 8 changed files with 444 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,11 @@ project adheres to [Semantic Versioning](http://semver.org/).
 - Added `cu::DeviceMemory::memset()`
 - Added `cu::Stream::memsetAsync()`
 - Added `nvml::Device::getPower()`
+- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`,
+  and `cu::Stream::memcpyDtoD2DAsync()`
+- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()`
+- Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R`
+- Added `cu::Device::getOrdinal()`
 
 ### Changed
 
@@ -24,6 +29,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
 - Upgrade Catch2 to version v3.6.0
 - `target_embed_source` is now more robust: it properly tracks dependencies and
   runs again whenever any of them changes
+- Expanded tests to cover the new 2D memory operations and FFT support
 
 ## \[0.8.0\] - 2024-07-05
 

diff --git a/README.dev.md b/README.dev.md
@@ -299,5 +299,5 @@ To view the generated documentation, open `_build/html/index.html` in your web-b
 
 ### Verification
 
-1. Make sure the new release is added to Zenodo (see <https://zenodo.org/record/6076447>)
+1. Make sure the new release is added to Zenodo (see <https://zenodo.org/records/8075251>)
 1. Activate the latest release documentation at <https://readthedocs.org/projects/cudawrappers/versions/>
diff --git a/cmake/cudawrappers-helper.cmake b/cmake/cudawrappers-helper.cmake
@@ -1,4 +1,4 @@
-# Retun a list of asbolute file names for all the local includes of the
+# Return a list of absolute file names for all the local includes of the
 # input_file.  Only files in the root directory will be considered.
 function(get_local_includes input_file root_dir)
   file(READ ${input_file} input_file_contents)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
@@ -94,6 +94,23 @@ class Wrapper {
 
   explicit Wrapper(T &obj) : _obj(obj) {}
 
+  template <CUmemorytype... AllowedMemoryTypes>
+  inline void checkPointerAccess(const CUdeviceptr &pointer) const {
+    CUmemorytype memoryType;
+    checkCudaCall(cuPointerGetAttribute(
+        &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer));
+
+    // Check if the memoryType is one of the allowed memory types
+    for (auto allowedType : {AllowedMemoryTypes...}) {
+      if (memoryType == allowedType) {
+        return;
+      }
+    }
+
+    throw std::runtime_error(
+        "Invalid memory type: allowed types are not matched.");
+  }
+
   T _obj{};
   std::shared_ptr<T> manager;
 };
@@ -188,6 +205,8 @@ class Device : public Wrapper<CUdevice> {
     return size;
   }
 
+  int getOrdinal() const { return _ordinal; }
+
   // Primary Context Management
   std::pair<unsigned, bool> primaryCtxGetState() const {
     unsigned flags{};
@@ -597,6 +616,32 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
     checkCudaCall(cuMemsetD32(_obj, value, size));
   }
 
+  void memset2D(unsigned char value, size_t pitch, size_t width,
+                size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height));
+#endif
+  }
+
+  void memset2D(unsigned short value, size_t pitch, size_t width,
+                size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height));
+#endif
+  }
+
+  void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
+#else
+    checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height));
+#endif
+  }
+
   void zero(size_t size) { memset(static_cast<unsigned char>(0), size); }
 
   const void *parameter()
@@ -607,15 +652,14 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
 
   template <typename T>
   operator T *() {
-    int data;
-    checkCudaCall(
-        cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
-    if (data) {
-      return reinterpret_cast<T *>(_obj);
-    } else {
-      throw std::runtime_error(
-          "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer.");
-    }
+    checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
+    return reinterpret_cast<T *>(_obj);
+  }
+
+  template <typename T>
+  operator T *() const {
+    checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
+    return reinterpret_cast<T const *>(_obj);
   }
 
   size_t size() const { return _size; }
@@ -670,6 +714,74 @@ class Stream : public Wrapper<CUstream> {
 #endif
   }
 
+  void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch,
+                         const void *hostPtr, size_t spitch, size_t width,
+                         size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemcpy2DAsync(devPtr, dpitch, hostPtr, spitch, width,
+                                   height, hipMemcpyHostToDevice, _obj));
+#else
+    // Initialize the CUDA_MEMCPY2D structure
+    CUDA_MEMCPY2D copyParams = {0};
+
+    // Set width and height for the 2D copy
+    copyParams.WidthInBytes = width;
+    copyParams.Height = height;
+
+    // Set the destination (dst)
+    copyParams.dstXInBytes = 0;
+    copyParams.dstY = 0;
+    copyParams.dstPitch = dpitch;
+
+    // Set the source (src)
+    copyParams.srcXInBytes = 0;
+    copyParams.srcY = 0;
+    copyParams.srcPitch = spitch;
+
+    copyParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+    copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copyParams.srcHost = hostPtr;
+    copyParams.dstDevice = devPtr;
+
+    // Call the driver API function cuMemcpy2DAsync
+    checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
+#endif
+  }
+
+  void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch,
+                         const DeviceMemory &devPtr, size_t spitch,
+                         size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemcpy2DAsync(hostPtr, dpitch, devPtr, spitch, width,
+                                   height, hipMemcpyDeviceToHost, _obj));
+#else
+    // Initialize the CUDA_MEMCPY2D structure
+    CUDA_MEMCPY2D copyParams = {0};
+
+    // Set width and height for the 2D copy
+    copyParams.WidthInBytes = width;
+    copyParams.Height = height;
+
+    // Set the destination (dst)
+    copyParams.dstXInBytes = 0;
+    copyParams.dstY = 0;
+    copyParams.dstPitch = dpitch;
+
+    // Set the source (src)
+    copyParams.srcXInBytes = 0;
+    copyParams.srcY = 0;
+    copyParams.srcPitch = spitch;
+
+    copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    copyParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+    copyParams.srcDevice = devPtr;
+    copyParams.dstHost = hostPtr;
+
+    // Call the driver API function cuMemcpy2DAsync
+    checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
+#endif
+  }
+
   void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) {
 #if defined(__HIP__)
     checkCudaCall(
@@ -716,10 +828,43 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj));
   }
 
+  void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
+  void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(
+        cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
+  void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch,
+                     size_t width, size_t height) {
+#if defined(__HIP__)
+    checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
+#else
+    checkCudaCall(
+        cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj));
+#endif
+  }
+
   void zero(DeviceMemory &devPtr, size_t size) {
     memsetAsync(devPtr, static_cast<unsigned char>(0), size);
   }
 
+  void zero2D(DeviceMemory &devPtr, size_t pitch, size_t width, size_t height) {
+    memset2DAsync(devPtr, static_cast<unsigned char>(0), pitch, width, height);
+  }
+
   void launchKernel(Function &function, unsigned gridX, unsigned gridY,
                     unsigned gridZ, unsigned blockX, unsigned blockY,
                     unsigned blockZ, unsigned sharedMemBytes,
@@ -777,4 +922,4 @@ inline void Event::record(Stream &stream) {
 }
 }  // namespace cu
 
-#endif
+#endif