Skip to content

Commit

Permalink
Added support for 2D operations and C2R/R2C FFT (#305)
Browse files Browse the repository at this point in the history
* Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`,
  and `cu::Stream::memcpyDtoD2DAsync()`
* Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()`
* Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R`
* Added `cu::Device::getOrdinal()`
* Allow non-managed memory dereferencing in `cu::DeviceMemory`  

---------

Co-authored-by: Bram Veenboer <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Nov 7, 2024
1 parent a7f1c0a commit fd7f102
Show file tree
Hide file tree
Showing 8 changed files with 444 additions and 42 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ project adheres to [Semantic Versioning](http://semver.org/).
- Added `cu::DeviceMemory::memset()`
- Added `cu::Stream::memsetAsync()`
- Added `nvml::Device::getPower()`
- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`,
and `cu::Stream::memcpyDtoD2DAsync()`
- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()`
- Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R`
- Added `cu::Device::getOrdinal()`

### Changed

Expand All @@ -24,6 +29,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
- Upgrade Catch2 to version v3.6.0
- `target_embed_source` is now more robust: it properly tracks dependencies and
runs again whenever any of them changes
- Expanded tests to cover the new 2D memory operations and FFT support

## \[0.8.0\] - 2024-07-05

Expand Down
2 changes: 1 addition & 1 deletion README.dev.md
Original file line number Diff line number Diff line change
Expand Up @@ -299,5 +299,5 @@ To view the generated documentation, open `_build/html/index.html` in your web-b

### Verification

1. Make sure the new release is added to Zenodo (see <https://zenodo.org/record/6076447>)
1. Make sure the new release is added to Zenodo (see <https://zenodo.org/records/8075251>)
1. Activate the latest release documentation at <https://readthedocs.org/projects/cudawrappers/versions/>
2 changes: 1 addition & 1 deletion cmake/cudawrappers-helper.cmake
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Retun a list of asbolute file names for all the local includes of the
# Return a list of absolute file names for all the local includes of the
# input_file. Only files in the root directory will be considered.
function(get_local_includes input_file root_dir)
file(READ ${input_file} input_file_contents)
Expand Down
165 changes: 155 additions & 10 deletions include/cudawrappers/cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,23 @@ class Wrapper {

explicit Wrapper(T &obj) : _obj(obj) {}

template <CUmemorytype... AllowedMemoryTypes>
inline void checkPointerAccess(const CUdeviceptr &pointer) const {
CUmemorytype memoryType;
checkCudaCall(cuPointerGetAttribute(
&memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer));

// Check if the memoryType is one of the allowed memory types
for (auto allowedType : {AllowedMemoryTypes...}) {
if (memoryType == allowedType) {
return;
}
}

throw std::runtime_error(
"Invalid memory type: allowed types are not matched.");
}

T _obj{};
std::shared_ptr<T> manager;
};
Expand Down Expand Up @@ -188,6 +205,8 @@ class Device : public Wrapper<CUdevice> {
return size;
}

int getOrdinal() const { return _ordinal; }

// Primary Context Management
std::pair<unsigned, bool> primaryCtxGetState() const {
unsigned flags{};
Expand Down Expand Up @@ -597,6 +616,32 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
checkCudaCall(cuMemsetD32(_obj, value, size));
}

void memset2D(unsigned char value, size_t pitch, size_t width,
size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
#else
checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height));
#endif
}

void memset2D(unsigned short value, size_t pitch, size_t width,
size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
#else
checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height));
#endif
}

void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2D(_obj, pitch, value, width, height));
#else
checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height));
#endif
}

void zero(size_t size) { memset(static_cast<unsigned char>(0), size); }

const void *parameter()
Expand All @@ -607,15 +652,14 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {

template <typename T>
operator T *() {
int data;
checkCudaCall(
cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
if (data) {
return reinterpret_cast<T *>(_obj);
} else {
throw std::runtime_error(
"Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer.");
}
checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
return reinterpret_cast<T *>(_obj);
}

template <typename T>
operator T *() const {
checkPointerAccess<CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_UNIFIED>(_obj);
return reinterpret_cast<T const *>(_obj);
}

size_t size() const { return _size; }
Expand Down Expand Up @@ -670,6 +714,74 @@ class Stream : public Wrapper<CUstream> {
#endif
}

void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch,
const void *hostPtr, size_t spitch, size_t width,
size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemcpy2DAsync(devPtr, dpitch, hostPtr, spitch, width,
height, hipMemcpyHostToDevice, _obj));
#else
// Initialize the CUDA_MEMCPY2D structure
CUDA_MEMCPY2D copyParams = {0};

// Set width and height for the 2D copy
copyParams.WidthInBytes = width;
copyParams.Height = height;

// Set the destination (dst)
copyParams.dstXInBytes = 0;
copyParams.dstY = 0;
copyParams.dstPitch = dpitch;

// Set the source (src)
copyParams.srcXInBytes = 0;
copyParams.srcY = 0;
copyParams.srcPitch = spitch;

copyParams.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
copyParams.srcHost = hostPtr;
copyParams.dstDevice = devPtr;

// Call the driver API function cuMemcpy2DAsync
checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
#endif
}

void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch,
const DeviceMemory &devPtr, size_t spitch,
size_t width, size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemcpy2DAsync(hostPtr, dpitch, devPtr, spitch, width,
height, hipMemcpyDeviceToHost, _obj));
#else
// Initialize the CUDA_MEMCPY2D structure
CUDA_MEMCPY2D copyParams = {0};

// Set width and height for the 2D copy
copyParams.WidthInBytes = width;
copyParams.Height = height;

// Set the destination (dst)
copyParams.dstXInBytes = 0;
copyParams.dstY = 0;
copyParams.dstPitch = dpitch;

// Set the source (src)
copyParams.srcXInBytes = 0;
copyParams.srcY = 0;
copyParams.srcPitch = spitch;

copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copyParams.dstMemoryType = CU_MEMORYTYPE_HOST;
copyParams.srcDevice = devPtr;
copyParams.dstHost = hostPtr;

// Call the driver API function cuMemcpy2DAsync
checkCudaCall(cuMemcpy2DAsync(&copyParams, _obj));
#endif
}

void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) {
#if defined(__HIP__)
checkCudaCall(
Expand Down Expand Up @@ -716,10 +828,43 @@ class Stream : public Wrapper<CUstream> {
checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj));
}

void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch,
size_t width, size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
#else
checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj));
#endif
}

void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch,
size_t width, size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
#else
checkCudaCall(
cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj));
#endif
}

void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch,
size_t width, size_t height) {
#if defined(__HIP__)
checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj));
#else
checkCudaCall(
cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj));
#endif
}

void zero(DeviceMemory &devPtr, size_t size) {
memsetAsync(devPtr, static_cast<unsigned char>(0), size);
}

void zero2D(DeviceMemory &devPtr, size_t pitch, size_t width, size_t height) {
memset2DAsync(devPtr, static_cast<unsigned char>(0), pitch, width, height);
}

void launchKernel(Function &function, unsigned gridX, unsigned gridY,
unsigned gridZ, unsigned blockX, unsigned blockY,
unsigned blockZ, unsigned sharedMemBytes,
Expand Down Expand Up @@ -777,4 +922,4 @@ inline void Event::record(Stream &stream) {
}
} // namespace cu

#endif
#endif
Loading

0 comments on commit fd7f102

Please sign in to comment.