From 74559b8a0cd418efc1b7df6e3aae6f069ffcf753 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Tue, 22 Oct 2024 15:59:10 +0200 Subject: [PATCH 01/23] Added 2D memcpy, memset and zeroing functionality. --- include/cudawrappers/cu.hpp | 145 +++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 2 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 032ab44..c09a044 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -23,6 +23,7 @@ #include #endif + namespace cu { class Error : public std::exception { public: @@ -188,6 +189,10 @@ class Device : public Wrapper { return size; } + size_t getTotalConstMem() const { + return static_cast(getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); + } + // Primary Context Management std::pair primaryCtxGetState() const { unsigned flags{}; @@ -597,14 +602,40 @@ class DeviceMemory : public Wrapper { checkCudaCall(cuMemsetD32(_obj, value, size)); } - void zero(size_t size) { memset(static_cast(0), size); } + void memset2D(unsigned char value, size_t size, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D8(_obj, value, size, width, height)); + } + + void memset2D(unsigned short value, size_t size, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D16(_obj, value, size, width, height)); + } + + void memset2D(unsigned int value, size_t size, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D32(_obj, value, size, width, height)); + } + + void zero(size_t size) { + memset(static_cast(0), size); + } + void memcpyToSymbolSync(const void *symbol, size_t count, size_t offset) { + if (cudaMemcpyToSymbol(symbol,reinterpret_cast(_obj), count, offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { + throw cu::Error(CUDA_ERROR_UNKNOWN); + } + } + const void *parameter() const // used to construct parameter list for launchKernel(); { return &_obj; } + // FIXME: remove this function. + void *parameter_copy_temp() + { + return reinterpret_cast(_obj); + } + template operator T *() { int data; @@ -670,6 +701,70 @@ class Stream : public Wrapper { #endif } + void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch, const void *hostPtr, size_t spitch, size_t width, size_t height) { +#if defined(__HIP__) + // FIXME: implement for HIP + #error "memcpyHtoD2DAsync not yet implemented for HIP" +#else + // Initialize the CUDA_MEMCPY2D structure + CUDA_MEMCPY2D copyParams = {0}; + + // Set width and height for the 2D copy + copyParams.WidthInBytes = width; + copyParams.Height = height; + + // Set the destination (dst) + copyParams.dstXInBytes = 0; + copyParams.dstY = 0; + copyParams.dstPitch = dpitch; + + // Set the source (src) + copyParams.srcXInBytes = 0; + copyParams.srcY = 0; + copyParams.srcPitch = spitch; + + copyParams.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.srcHost = hostPtr; + copyParams.dstDevice = devPtr; + + // Call the driver API function cuMemcpy2DAsync + checkCudaCall(cuMemcpy2DAsync(©Params, _obj)); +#endif + } + + void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch, const DeviceMemory &devPtr, size_t spitch, size_t width, size_t height) { +#if defined(__HIP__) + // FIXME: implement for HIP + #error "memcpyDtoH2DAsync not yet implemented for HIP" +#else + // Initialize the CUDA_MEMCPY2D structure + CUDA_MEMCPY2D copyParams = {0}; + + // Set width and height for the 2D copy + copyParams.WidthInBytes = width; + copyParams.Height = height; + + // Set the destination (dst) + copyParams.dstXInBytes = 0; + copyParams.dstY = 0; + copyParams.dstPitch = dpitch; + + // Set the source (src) + copyParams.srcXInBytes = 0; + copyParams.srcY = 0; + copyParams.srcPitch = spitch; + + copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.dstMemoryType = CU_MEMORYTYPE_HOST; + copyParams.srcDevice = devPtr; + copyParams.dstHost = hostPtr; + + // Call the driver API function cuMemcpy2DAsync + checkCudaCall(cuMemcpy2DAsync(©Params, _obj)); +#endif + } + void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) { #if defined(__HIP__) checkCudaCall( @@ -716,10 +811,26 @@ class Stream : public Wrapper { checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj)); } + void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj)); + } + + void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj)); + } + + void memset2DAsync(DeviceMemory &devPtr, int value, size_t pitch, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj)); + } + void zero(DeviceMemory &devPtr, size_t size) { memsetAsync(devPtr, static_cast(0), size); } + void zero2D(DeviceMemory &devPtr, size_t pitch, size_t width, size_t height) { + memset2DAsync(devPtr, static_cast(0), pitch, width, height); + } + void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, @@ -775,6 +886,36 @@ class Stream : public Wrapper { inline void Event::record(Stream &stream) { checkCudaCall(cuEventRecord(_obj, stream._obj)); } + +// inline void memcpyToSymbolSync(const void *symbol, cu::DeviceMemory &src, size_t count, size_t offset) { +// // cudaMemcpyToSymbolAsync(c_killmask, src, count, offset, +// // cudaMemcpyDeviceToDevice, stream); +// #if defined(__HIP__) +// // FIXME: finish HIP implementation. +// #error "memcpyToSymbolAsync not yet implemented for HIP" +// #else + +// // FIXME: find the 'cu' equivalent of cudaMemcpyToSymbolAsync, i.e. cuMemcpyToSymbolAsync +// // checkCudaCall(cudaMemcpyToSymbol(symbol, src, count, offset, cudaMemcpyDeviceToDevice)); + + +// if (cudaMemcpyToSymbol(symbol, src.parameter_by_copy(), count, offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { +// throw cu::Error(CUDA_ERROR_UNKNOWN); +// } + + +// // CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name ) +// // CUdeviceptr dptr = nullptr; +// // size_t dsize = 0; + +// // checkCudaCall(cuModuleGetGlobal()) + +// // checkCudaCall(cuMemcpyToSymbolAsync(symbol, src, count, cudaMemcpyDeviceToDevice, _obj)); +// #endif +// } + } // namespace cu -#endif + + +#endif \ No newline at end of file From 0d06efd952e233eea5cdf274d233b581b66fe7b5 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Tue, 22 Oct 2024 16:02:52 +0200 Subject: [PATCH 02/23] WIP: configurable FFT for real-to-complex (and vice versa) conversions. --- include/cudawrappers/cufft.hpp | 75 +++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index 7457fc5..6ce5759 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -12,6 +12,7 @@ #endif #include +#include #include "cudawrappers/cu.hpp" @@ -224,6 +225,78 @@ FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { template <> FFT2D::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {} +/* + * FFT1DRealToComplex + */ +template +class FFT1DRealToComplex : public FFT { + public: +#if defined(__HIP__) + __host__ +#endif + FFT1DRealToComplex(int nx) = delete; +#if defined(__HIP__) + __host__ +#endif + FFT1DRealToComplex(int nx, int batch) = delete; + + FFT1DRealToComplex(int nx, int batch, std::array& inembed, std::array& ouembed) = delete; +}; + + +template <> +FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, std::array& inembed, std::array& ouembed) { + checkCuFFTCall(cufftCreate(plan())); + const int rank = 1; + size_t ws = 0; + std::array n{nx}; + long long int idist = inembed[0]; + long long int odist = ouembed[0]; + int istride = 1; + int ostride = 1; + checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), inembed.data(), istride, + idist, CUDA_R_32F, ouembed.data(), ostride, odist, + CUDA_C_32F, batch, &ws, CUDA_R_32F)); +} + +/* + * FFT1DComplexToReal + */ +template +class FFT1DComplexToReal : public FFT { + public: +#if defined(__HIP__) + __host__ +#endif + FFT1DComplexToReal(int nx) = delete; +#if defined(__HIP__) + __host__ +#endif + FFT1DComplexToReal(int nx, int batch) = delete; + + FFT1DComplexToReal(int nx, int batch, std::array& inembed, std::array& ouembed) = delete; +}; + + +template <> +FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, std::array& inembed, std::array& ouembed) { + checkCuFFTCall(cufftCreate(plan())); + const int rank = 1; + size_t ws = 0; + std::array n{nx}; + long long int idist = inembed[0]; + long long int odist = ouembed[0]; + int istride = 1; + int ostride = 1; + checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), inembed.data(), istride, + idist, CUDA_C_32F, ouembed.data(), ostride, odist, + CUDA_R_32F, batch, &ws, CUDA_C_32F)); +} + + } // namespace cufft -#endif // CUFFT_H + + + +#endif // CUFFT_H \ No newline at end of file From a4f390ef58e8840cc3b0b9e38193274125143777 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Wed, 23 Oct 2024 12:28:24 +0200 Subject: [PATCH 03/23] Added tests for new cudawrapper::cu methods. --- include/cudawrappers/cu.hpp | 86 +++++++++++++----------- tests/test_cu.cpp | 127 +++++++++++++++++++++++++++++++++++- 2 files changed, 173 insertions(+), 40 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index c09a044..8855622 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -23,7 +23,6 @@ #include #endif - namespace cu { class Error : public std::exception { public: @@ -190,7 +189,8 @@ class Device : public Wrapper { } size_t getTotalConstMem() const { - return static_cast(getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); + return static_cast( + getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); } // Primary Context Management @@ -602,28 +602,29 @@ class DeviceMemory : public Wrapper { checkCudaCall(cuMemsetD32(_obj, value, size)); } - void memset2D(unsigned char value, size_t size, size_t width, size_t height) { - checkCudaCall(cuMemsetD2D8(_obj, value, size, width, height)); + void memset2D(unsigned char value, size_t pitch, size_t width, + size_t height) { + checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height)); } - void memset2D(unsigned short value, size_t size, size_t width, size_t height) { - checkCudaCall(cuMemsetD2D16(_obj, value, size, width, height)); + void memset2D(unsigned short value, size_t pitch, size_t width, + size_t height) { + checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height)); } - void memset2D(unsigned int value, size_t size, size_t width, size_t height) { - checkCudaCall(cuMemsetD2D32(_obj, value, size, width, height)); + void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) { + checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height)); } - void zero(size_t size) { - memset(static_cast(0), size); - } + void zero(size_t size) { memset(static_cast(0), size); } void memcpyToSymbolSync(const void *symbol, size_t count, size_t offset) { - if (cudaMemcpyToSymbol(symbol,reinterpret_cast(_obj), count, offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { + if (cudaMemcpyToSymbol(symbol, reinterpret_cast(_obj), count, + offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { throw cu::Error(CUDA_ERROR_UNKNOWN); } } - + const void *parameter() const // used to construct parameter list for launchKernel(); { @@ -631,10 +632,7 @@ class DeviceMemory : public Wrapper { } // FIXME: remove this function. - void *parameter_copy_temp() - { - return reinterpret_cast(_obj); - } + void *parameter_copy_temp() { return reinterpret_cast(_obj); } template operator T *() { @@ -701,10 +699,12 @@ class Stream : public Wrapper { #endif } - void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch, const void *hostPtr, size_t spitch, size_t width, size_t height) { + void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch, + const void *hostPtr, size_t spitch, size_t width, + size_t height) { #if defined(__HIP__) - // FIXME: implement for HIP - #error "memcpyHtoD2DAsync not yet implemented for HIP" +// FIXME: implement for HIP +#error "memcpyHtoD2DAsync not yet implemented for HIP" #else // Initialize the CUDA_MEMCPY2D structure CUDA_MEMCPY2D copyParams = {0}; @@ -733,10 +733,12 @@ class Stream : public Wrapper { #endif } - void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch, const DeviceMemory &devPtr, size_t spitch, size_t width, size_t height) { + void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch, + const DeviceMemory &devPtr, size_t spitch, + size_t width, size_t height) { #if defined(__HIP__) - // FIXME: implement for HIP - #error "memcpyDtoH2DAsync not yet implemented for HIP" +// FIXME: implement for HIP +#error "memcpyDtoH2DAsync not yet implemented for HIP" #else // Initialize the CUDA_MEMCPY2D structure CUDA_MEMCPY2D copyParams = {0}; @@ -811,16 +813,21 @@ class Stream : public Wrapper { checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj)); } - void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch, size_t width, size_t height) { + void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch, + size_t width, size_t height) { checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj)); } - void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch, size_t width, size_t height) { - checkCudaCall(cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj)); + void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch, + size_t width, size_t height) { + checkCudaCall( + cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj)); } - void memset2DAsync(DeviceMemory &devPtr, int value, size_t pitch, size_t width, size_t height) { - checkCudaCall(cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj)); + void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch, + size_t width, size_t height) { + checkCudaCall( + cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj)); } void zero(DeviceMemory &devPtr, size_t size) { @@ -887,7 +894,8 @@ inline void Event::record(Stream &stream) { checkCudaCall(cuEventRecord(_obj, stream._obj)); } -// inline void memcpyToSymbolSync(const void *symbol, cu::DeviceMemory &src, size_t count, size_t offset) { +// inline void memcpyToSymbolSync(const void *symbol, cu::DeviceMemory &src, +// size_t count, size_t offset) { // // cudaMemcpyToSymbolAsync(c_killmask, src, count, offset, // // cudaMemcpyDeviceToDevice, stream); // #if defined(__HIP__) @@ -895,27 +903,27 @@ inline void Event::record(Stream &stream) { // #error "memcpyToSymbolAsync not yet implemented for HIP" // #else -// // FIXME: find the 'cu' equivalent of cudaMemcpyToSymbolAsync, i.e. cuMemcpyToSymbolAsync -// // checkCudaCall(cudaMemcpyToSymbol(symbol, src, count, offset, cudaMemcpyDeviceToDevice)); - +// // FIXME: find the 'cu' equivalent of cudaMemcpyToSymbolAsync, i.e. +// cuMemcpyToSymbolAsync +// // checkCudaCall(cudaMemcpyToSymbol(symbol, src, count, offset, +// cudaMemcpyDeviceToDevice)); -// if (cudaMemcpyToSymbol(symbol, src.parameter_by_copy(), count, offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { +// if (cudaMemcpyToSymbol(symbol, src.parameter_by_copy(), count, offset, +// cudaMemcpyDeviceToDevice) != cudaSuccess) { // throw cu::Error(CUDA_ERROR_UNKNOWN); // } - -// // CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name ) +// // CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule +// hmod, const char* name ) // // CUdeviceptr dptr = nullptr; // // size_t dsize = 0; // // checkCudaCall(cuModuleGetGlobal()) -// // checkCudaCall(cuMemcpyToSymbolAsync(symbol, src, count, cudaMemcpyDeviceToDevice, _obj)); -// #endif +// // checkCudaCall(cuMemcpyToSymbolAsync(symbol, src, count, +// cudaMemcpyDeviceToDevice, _obj)); #endif // } } // namespace cu - - #endif \ No newline at end of file diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index e383b0f..42e11b6 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -23,6 +23,20 @@ TEST_CASE("Test cu::Device", "[device]") { std::cout << "Device arch: " << arch << std::endl; CHECK(arch.size() > 0); } + + SECTION("Test Device.totalMem") { + const size_t total_mem = device.totalMem(); + std::cout << "Device total memory: " << (total_mem / (1024 * 1024)) + << " bytes" << std::endl; + CHECK(total_mem > 0); + } + + SECTION("Test Device.getTotalConstMem") { + const size_t const_mem = device.getTotalConstMem(); + std::cout << "Device constant memory: " << const_mem << " bytes" + << std::endl; + CHECK(const_mem > 0); + } } TEST_CASE("Test context::getDevice", "[device]") { @@ -57,6 +71,24 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream", CHECK(src == tgt); } + SECTION("Test copying a 2D std::array to the device and back") { + const std::array, 3> src = { + {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}}; + std::array, 3> tgt = {{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}; + const size_t width = 3 * sizeof(int); + const size_t height = 3; + const size_t pitch = width; + + cu::DeviceMemory mem(pitch * height); + + cu::Stream stream; + stream.memcpyHtoD2DAsync(mem, pitch, src.data(), pitch, width, height); + stream.memcpyDtoH2DAsync(tgt.data(), pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(src == tgt); + } + SECTION("Test copying HostMemory to the device and back") { const size_t N = 3; const size_t size = N * sizeof(int); @@ -79,6 +111,34 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream", CHECK(!static_cast(memcmp(src, tgt, size))); } + + SECTION("Test copying 2D HostMemory to the device and back") { + const size_t width = 3 * sizeof(int); + const size_t height = 3; + const size_t pitch = width; + const size_t size = pitch * height; + cu::HostMemory src(size); + cu::HostMemory tgt(size); + + // Populate the 2D memory with values + int* const src_ptr = static_cast(src); + int* const tgt_ptr = static_cast(tgt); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < 3; ++x) { + src_ptr[y * 3 + x] = y * 3 + x + 1; + tgt_ptr[y * 3 + x] = 0; + } + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + stream.memcpyHtoD2DAsync(mem, pitch, src, pitch, width, height); + stream.memcpyDtoH2DAsync(tgt, pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(static_cast(memcmp(src, tgt, size)) == 0); + } } TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { @@ -206,7 +266,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { } using TestTypes = std::tuple; -TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) { +TEMPLATE_LIST_TEST_CASE("Test memset 1D", "[memset]", TestTypes) { cu::init(); cu::Device device(0); cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); @@ -263,6 +323,71 @@ TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) { } } +using TestTypes = std::tuple; +TEMPLATE_LIST_TEST_CASE("Test memset 2D", "[memset]", TestTypes) { + cu::init(); + cu::Device device(0); + cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); + + SECTION("Test memset2D cu::DeviceMemory asynchronously") { + const size_t width = 3; + const size_t height = 3; + const size_t pitch = width * sizeof(TestType); + const size_t size = pitch * height; + cu::HostMemory a(size); + cu::HostMemory b(size); + TestType value = 0xAA; + + // Populate the memory with initial values + TestType* const a_ptr = static_cast(a); + TestType* const b_ptr = static_cast(b); + for (int i = 0; i < width * height; i++) { + a_ptr[i] = 0; + b_ptr[i] = value; + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + // Perform the 2D memory operations + stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height); + stream.memset2DAsync(mem, value, pitch, width, height); + stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height); + + CHECK(static_cast(memcmp(a, b, size))); + } + + SECTION("Test zeroing cu::DeviceMemory synchronously in 2D") { + const size_t width = 3; + const size_t height = 3; + const size_t pitch = width * sizeof(TestType); + const size_t size = pitch * height; + cu::HostMemory a(size); + cu::HostMemory b(size); + TestType value = 0xAA; + + // Populate the memory with initial values + TestType* const a_ptr = static_cast(a); + TestType* const b_ptr = static_cast(b); + for (int i = 0; i < width * height; i++) { + a_ptr[i] = 0; + b_ptr[i] = value; + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + // Perform the 2D memory operations + stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height); + stream.synchronize(); + mem.memset2D(value, pitch, width, height); + stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(static_cast(memcmp(a, b, size))); + } +} + TEST_CASE("Test cu::Stream", "[stream]") { cu::init(); cu::Device device(0); From 165d2a6db621ac4316c0904219084afccf3e3c2d Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Tue, 29 Oct 2024 16:37:43 +0100 Subject: [PATCH 04/23] Simplified FFT1DRealToComplex interface and added relevant tests. --- include/cudawrappers/cu.hpp | 10 ++++++++ include/cudawrappers/cufft.hpp | 45 +++++++++++++++++----------------- tests/test_cufft.cpp | 28 +++++++++++++++++++++ 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 8855622..4a68b3d 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -189,6 +189,7 @@ class Device : public Wrapper { } size_t getTotalConstMem() const { + // FIXME: implement HIP. return static_cast( getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); } @@ -618,6 +619,15 @@ class DeviceMemory : public Wrapper { void zero(size_t size) { memset(static_cast(0), size); } + void memcpyToSymbolAsync(const void *symbol, size_t count, size_t offset, + cudaStream_t stream) { + if (cudaMemcpyToSymbolAsync(symbol, reinterpret_cast(_obj), + count, offset, cudaMemcpyDeviceToDevice, + stream) != cudaSuccess) { + throw cu::Error(CUDA_ERROR_UNKNOWN); + } + } + void memcpyToSymbolSync(const void *symbol, size_t count, size_t offset) { if (cudaMemcpyToSymbol(symbol, reinterpret_cast(_obj), count, offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index 6ce5759..605fcb3 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -11,8 +11,8 @@ #include #endif -#include #include +#include #include "cudawrappers/cu.hpp" @@ -240,23 +240,26 @@ class FFT1DRealToComplex : public FFT { #endif FFT1DRealToComplex(int nx, int batch) = delete; - FFT1DRealToComplex(int nx, int batch, std::array& inembed, std::array& ouembed) = delete; + FFT1DRealToComplex(int nx, int batch, long long inembed, + long long ouembed) = delete; }; - template <> -FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, std::array& inembed, std::array& ouembed) { +FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, + long long inembed, + long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = inembed[0]; - long long int odist = ouembed[0]; + const long long int idist = inembed; + const long long int odist = ouembed; + int istride = 1; int ostride = 1; - checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), inembed.data(), istride, - idist, CUDA_R_32F, ouembed.data(), ostride, odist, - CUDA_C_32F, batch, &ws, CUDA_R_32F)); + checkCuFFTCall(cufftXtMakePlanMany( + *plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed, + ostride, odist, CUDA_C_32F, batch, &ws, CUDA_R_32F)); } /* @@ -266,6 +269,7 @@ template class FFT1DComplexToReal : public FFT { public: #if defined(__HIP__) + __host__ #endif FFT1DComplexToReal(int nx) = delete; @@ -273,30 +277,27 @@ class FFT1DComplexToReal : public FFT { __host__ #endif FFT1DComplexToReal(int nx, int batch) = delete; - - FFT1DComplexToReal(int nx, int batch, std::array& inembed, std::array& ouembed) = delete; + FFT1DComplexToReal(int nx, int batch, long long inembed, + long long ouembed) = delete; }; - template <> -FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, std::array& inembed, std::array& ouembed) { +FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, + long long inembed, + long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = inembed[0]; - long long int odist = ouembed[0]; + long long int idist = inembed; + long long int odist = ouembed; int istride = 1; int ostride = 1; - checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), inembed.data(), istride, - idist, CUDA_C_32F, ouembed.data(), ostride, odist, - CUDA_R_32F, batch, &ws, CUDA_C_32F)); + checkCuFFTCall(cufftXtMakePlanMany( + *plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed, + ostride, odist, CUDA_R_32F, batch, &ws, CUDA_C_32F)); } - } // namespace cufft - - - #endif // CUFFT_H \ No newline at end of file diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp index 251091d..98af26a 100644 --- a/tests/test_cufft.cpp +++ b/tests/test_cufft.cpp @@ -114,6 +114,34 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") { scaleSignal(out_ptr, out_ptr, size, float(size)); compare(out_ptr, in_ptr, size); } + + SECTION("FP32 R2C C2R") { + const size_t arraySize = size * sizeof(cufftComplex); + + cu::HostMemory h_in(arraySize); + cu::HostMemory h_out(arraySize); + cu::DeviceMemory d_in(arraySize); + cu::DeviceMemory d_out(arraySize); + cu::DeviceMemory d_out2(arraySize); + + generateSignal(static_cast(h_in), size, patchSize, {1, 1}); + stream.memcpyHtoDAsync(d_in, h_in, arraySize); + + cufft::FFT1DRealToComplex fft_r2c(size, 1, 1, 1); + cufft::FFT1DComplexToReal fft_c2r(size, 1, 1, 1); + fft_r2c.setStream(stream); + fft_c2r.setStream(stream); + + fft_r2c.execute(d_in, d_out, CUFFT_FORWARD); + fft_c2r.execute(d_out, d_out2, CUFFT_INVERSE); + stream.memcpyDtoHAsync(h_out, d_out2, arraySize); + stream.synchronize(); + + cuFloatComplex *in_ptr = static_cast(h_in); + cuFloatComplex *out_ptr = static_cast(h_out); + scaleSignal(out_ptr, out_ptr, size, float(size)); + compare(out_ptr, in_ptr, size); + } } TEST_CASE("Test 2D FFT", "[FFT2D]") { From 122ccd9d8def2c09cdb077a05075ee0553acfe9d Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Wed, 30 Oct 2024 16:57:13 +0100 Subject: [PATCH 05/23] AMD tests passing. --- include/cudawrappers/cu.hpp | 48 ++++++++++++++++++++-------------- include/cudawrappers/cufft.hpp | 26 +++++++++++------- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 4a68b3d..c170eb5 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -605,36 +605,32 @@ class DeviceMemory : public Wrapper { void memset2D(unsigned char value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height)); +#endif } void memset2D(unsigned short value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height)); +#endif } void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height)); +#endif } void zero(size_t size) { memset(static_cast(0), size); } - void memcpyToSymbolAsync(const void *symbol, size_t count, size_t offset, - cudaStream_t stream) { - if (cudaMemcpyToSymbolAsync(symbol, reinterpret_cast(_obj), - count, offset, cudaMemcpyDeviceToDevice, - stream) != cudaSuccess) { - throw cu::Error(CUDA_ERROR_UNKNOWN); - } - } - - void memcpyToSymbolSync(const void *symbol, size_t count, size_t offset) { - if (cudaMemcpyToSymbol(symbol, reinterpret_cast(_obj), count, - offset, cudaMemcpyDeviceToDevice) != cudaSuccess) { - throw cu::Error(CUDA_ERROR_UNKNOWN); - } - } - const void *parameter() const // used to construct parameter list for launchKernel(); { @@ -713,8 +709,8 @@ class Stream : public Wrapper { const void *hostPtr, size_t spitch, size_t width, size_t height) { #if defined(__HIP__) -// FIXME: implement for HIP -#error "memcpyHtoD2DAsync not yet implemented for HIP" + checkCudaCall(hipMemcpy2DAsync(devPtr, dpitch, hostPtr, spitch, width, + height, hipMemcpyHostToDevice, _obj)); #else // Initialize the CUDA_MEMCPY2D structure CUDA_MEMCPY2D copyParams = {0}; @@ -747,8 +743,8 @@ class Stream : public Wrapper { const DeviceMemory &devPtr, size_t spitch, size_t width, size_t height) { #if defined(__HIP__) -// FIXME: implement for HIP -#error "memcpyDtoH2DAsync not yet implemented for HIP" + checkCudaCall(hipMemcpy2DAsync(hostPtr, dpitch, devPtr, spitch, width, + height, hipMemcpyDeviceToHost, _obj)); #else // Initialize the CUDA_MEMCPY2D structure CUDA_MEMCPY2D copyParams = {0}; @@ -825,19 +821,31 @@ class Stream : public Wrapper { void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj)); +#endif } void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else checkCudaCall( cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj)); +#endif } void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else checkCudaCall( cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj)); +#endif } void zero(DeviceMemory &devPtr, size_t size) { diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index 605fcb3..fa161af 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -226,7 +226,7 @@ template <> FFT2D::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {} /* - * FFT1DRealToComplex + * FFT2DRealToComplex */ template class FFT1DRealToComplex : public FFT { @@ -240,26 +240,29 @@ class FFT1DRealToComplex : public FFT { #endif FFT1DRealToComplex(int nx, int batch) = delete; +#if defined(__HIP__) + __host__ +#endif FFT1DRealToComplex(int nx, int batch, long long inembed, long long ouembed) = delete; }; template <> FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, - long long inembed, - long long ouembed) { + long long int inembed, + long long int ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - const long long int idist = inembed; - const long long int odist = ouembed; - + long long int idist = inembed; + long long int odist = ouembed; int istride = 1; int ostride = 1; + checkCuFFTCall(cufftXtMakePlanMany( *plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed, - ostride, odist, CUDA_C_32F, batch, &ws, CUDA_R_32F)); + ostride, odist, CUDA_C_32F, batch, &ws, CUDA_C_32F)); } /* @@ -269,7 +272,6 @@ template class FFT1DComplexToReal : public FFT { public: #if defined(__HIP__) - __host__ #endif FFT1DComplexToReal(int nx) = delete; @@ -277,14 +279,17 @@ class FFT1DComplexToReal : public FFT { __host__ #endif FFT1DComplexToReal(int nx, int batch) = delete; +#if defined(__HIP__) + __host__ +#endif FFT1DComplexToReal(int nx, int batch, long long inembed, long long ouembed) = delete; }; template <> FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, - long long inembed, - long long ouembed) { + long long int inembed, + long long int ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; @@ -293,6 +298,7 @@ FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, long long int odist = ouembed; int istride = 1; int ostride = 1; + checkCuFFTCall(cufftXtMakePlanMany( *plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed, ostride, odist, CUDA_R_32F, batch, &ws, CUDA_C_32F)); From e8eca2f6d283ff7f295007668851839ce7868bd6 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 31 Oct 2024 14:05:39 +0100 Subject: [PATCH 06/23] Disable nvtx for HIP. --- cmake/cudawrappers-helper.cmake | 2 +- include/cudawrappers/cu.hpp | 36 +-------------------------------- include/cudawrappers/nvtx.hpp | 21 +++++++++++++++++-- 3 files changed, 21 insertions(+), 38 deletions(-) diff --git a/cmake/cudawrappers-helper.cmake b/cmake/cudawrappers-helper.cmake index 9ea5173..1bfdc0c 100644 --- a/cmake/cudawrappers-helper.cmake +++ b/cmake/cudawrappers-helper.cmake @@ -1,4 +1,4 @@ -# Retun a list of asbolute file names for all the local includes of the +# Return a list of absolute file names for all the local includes of the # input_file. Only files in the root directory will be considered. function(get_local_includes input_file root_dir) file(READ ${input_file} input_file_contents) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index c170eb5..40717fa 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -189,7 +189,6 @@ class Device : public Wrapper { } size_t getTotalConstMem() const { - // FIXME: implement HIP. return static_cast( getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); } @@ -636,9 +635,7 @@ class DeviceMemory : public Wrapper { { return &_obj; } - - // FIXME: remove this function. - void *parameter_copy_temp() { return reinterpret_cast(_obj); } + void *parameter_copy() { return reinterpret_cast(_obj); } template operator T *() { @@ -911,37 +908,6 @@ class Stream : public Wrapper { inline void Event::record(Stream &stream) { checkCudaCall(cuEventRecord(_obj, stream._obj)); } - -// inline void memcpyToSymbolSync(const void *symbol, cu::DeviceMemory &src, -// size_t count, size_t offset) { -// // cudaMemcpyToSymbolAsync(c_killmask, src, count, offset, -// // cudaMemcpyDeviceToDevice, stream); -// #if defined(__HIP__) -// // FIXME: finish HIP implementation. -// #error "memcpyToSymbolAsync not yet implemented for HIP" -// #else - -// // FIXME: find the 'cu' equivalent of cudaMemcpyToSymbolAsync, i.e. -// cuMemcpyToSymbolAsync -// // checkCudaCall(cudaMemcpyToSymbol(symbol, src, count, offset, -// cudaMemcpyDeviceToDevice)); - -// if (cudaMemcpyToSymbol(symbol, src.parameter_by_copy(), count, offset, -// cudaMemcpyDeviceToDevice) != cudaSuccess) { -// throw cu::Error(CUDA_ERROR_UNKNOWN); -// } - -// // CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule -// hmod, const char* name ) -// // CUdeviceptr dptr = nullptr; -// // size_t dsize = 0; - -// // checkCudaCall(cuModuleGetGlobal()) - -// // checkCudaCall(cuMemcpyToSymbolAsync(symbol, src, count, -// cudaMemcpyDeviceToDevice, _obj)); #endif -// } - } // namespace cu #endif \ No newline at end of file diff --git a/include/cudawrappers/nvtx.hpp b/include/cudawrappers/nvtx.hpp index 5bb6f5f..31dac0d 100644 --- a/include/cudawrappers/nvtx.hpp +++ b/include/cudawrappers/nvtx.hpp @@ -1,7 +1,9 @@ #if !defined NVTX_H #define NVTX_H +#if !defined(__HIP__) #include +#endif namespace nvtx { @@ -9,6 +11,10 @@ class Marker { public: enum Color { red, green, blue, yellow, black }; +#if defined(__HIP__) + explicit Marker(const char* message, unsigned color = Color::green) {} + +#else explicit Marker(const char* message, unsigned color = Color::green) : _attributes{0} { _attributes.version = NVTX_VERSION; @@ -18,12 +24,21 @@ class Marker { _attributes.messageType = NVTX_MESSAGE_TYPE_ASCII; _attributes.message.ascii = message; } +#endif Marker(const char* message, Color color) : Marker(message, convert(color)) {} - void start() { _id = nvtxRangeStartEx(&_attributes); } + void start() { +#if !defined(__HIP__) + _id = nvtxRangeStartEx(&_attributes); +#endif + } - void end() { nvtxRangeEnd(_id); } + void end() { +#if !defined(__HIP__) + nvtxRangeEnd(_id); +#endif + } private: unsigned int convert(Color color) { @@ -43,8 +58,10 @@ class Marker { } } +#if !defined(__HIP__) nvtxEventAttributes_t _attributes; nvtxRangeId_t _id; +#endif }; } // end namespace nvtx From 4272ce272a7d8e1e1cbfcf5df1ca304828c1b43c Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 31 Oct 2024 14:09:07 +0100 Subject: [PATCH 07/23] Updated changelog. --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d4f589..7b1013d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::DeviceMemory::memset()` - Added `cu::Stream::memsetAsync()` - Added `nvml::Device::getPower()` +- Added 2D memcpy and memset operations +- Added `FFT1DRealToComplex` and `FFT1DComplexToReal` ### Changed From d7120711c314c7e1b781cc74211c8b5e2689d9f9 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 31 Oct 2024 14:12:25 +0100 Subject: [PATCH 08/23] Fixed formatting. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b1013d..947d1d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::Stream::memsetAsync()` - Added `nvml::Device::getPower()` - Added 2D memcpy and memset operations -- Added `FFT1DRealToComplex` and `FFT1DComplexToReal` +- Added `FFT1DRealToComplex` and `FFT1DComplexToReal` ### Changed From ef908e20e3b7652a2cf20f80f4cd89972f11f303 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Fri, 1 Nov 2024 13:10:39 +0100 Subject: [PATCH 09/23] Formatting tests/test_cu.cpp Co-authored-by: Bram Veenboer --- tests/test_cu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index 42e11b6..2abb948 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -24,7 +24,7 @@ TEST_CASE("Test cu::Device", "[device]") { CHECK(arch.size() > 0); } - SECTION("Test Device.totalMem") { + SECTION("Test device::totalMem", "[device]") { const size_t total_mem = device.totalMem(); std::cout << "Device total memory: " << (total_mem / (1024 * 1024)) << " bytes" << std::endl; From 9702b892095fa68adacc933907a8142cd42488aa Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Fri, 1 Nov 2024 13:59:20 +0100 Subject: [PATCH 10/23] Processed MR feedback. --- CHANGELOG.md | 9 ++- include/cudawrappers/cu.hpp | 2 +- include/cudawrappers/cufft.hpp | 101 +++++++++++++++++---------------- tests/test_cu.cpp | 10 ++-- tests/test_cufft.cpp | 6 +- 5 files changed, 68 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 947d1d2..e48cd01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,12 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::DeviceMemory::memset()` - Added `cu::Stream::memsetAsync()` - Added `nvml::Device::getPower()` -- Added 2D memcpy and memset operations -- Added `FFT1DRealToComplex` and `FFT1DComplexToReal` +- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, + and `cu::Stream::memcpyDtoD2DAsync()` for 2D asynchronous memory copies. +- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` for 2D + memsets +- Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` for 1D real-to-complex and + vice verse FFT ### Changed @@ -26,6 +30,7 @@ project adheres to [Semantic Versioning](http://semver.org/). - Upgrade Catch2 to version v3.6.0 - `target_embed_source` is now more robust: it properly tracks dependencies and runs again whenever any of them changes +- Expanded tests to cover the new 2D memory operations and FFT support ## \[0.8.0\] - 2024-07-05 diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 40717fa..09abeda 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -182,7 +182,7 @@ class Device : public Wrapper { #endif } - size_t totalMem() const { + size_t getTotalMem() const { size_t size{}; checkCudaCall(cuDeviceTotalMem(&size, _obj)); return size; diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index fa161af..fb96132 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -111,18 +111,19 @@ class FFT { ~FFT() { checkCuFFTCall(cufftDestroy(plan_)); } - void setStream(cu::Stream &stream) { + void setStream(cu::Stream &stream) const { checkCuFFTCall(cufftSetStream(plan_, stream)); } - void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, int direction) { + void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, + const int direction) const { void *in_ptr = reinterpret_cast(static_cast(in)); void *out_ptr = reinterpret_cast(static_cast(out)); checkCuFFTCall(cufftXtExec(plan_, in_ptr, out_ptr, direction)); } protected: - void checkCuFFTCall(cufftResult result) { + void checkCuFFTCall(cufftResult result) const { if (result != CUFFT_SUCCESS) { throw Error(result); } @@ -143,39 +144,39 @@ class FFT1D : public FFT { #if defined(__HIP__) __host__ #endif - FFT1D(int nx) = delete; + FFT1D(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1D(int nx, int batch) = delete; + FFT1D(const int nx, const int batch) = delete; }; template <> -FFT1D::FFT1D(int nx, int batch) { +FFT1D::FFT1D(const int nx, const int batch) { checkCuFFTCall(cufftCreate(plan())); checkCuFFTCall(cufftPlan1d(plan(), nx, CUFFT_C2C, batch)); } template <> -FFT1D::FFT1D(int nx) : FFT1D(nx, 1) {} +FFT1D::FFT1D(const int nx) : FFT1D(nx, 1) {} template <> -FFT1D::FFT1D(int nx, int batch) { +FFT1D::FFT1D(const int nx, const int batch) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = 1; - long long int odist = 1; - int istride = 1; - int ostride = 1; + const long long idist = 1; + const long long odist = 1; + const int istride = 1; + const int ostride = 1; checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride, idist, CUDA_C_16F, nullptr, ostride, odist, CUDA_C_16F, batch, &ws, CUDA_C_16F)); } template <> -FFT1D::FFT1D(int nx) : FFT1D(nx, 1) {} +FFT1D::FFT1D(const int nx) : FFT1D(nx, 1) {} /* * FFT2D @@ -186,21 +187,23 @@ class FFT2D : public FFT { #if defined(__HIP__) __host__ #endif - FFT2D(int nx, int ny) = delete; + FFT2D(const int nx, const int ny) = delete; #if defined(__HIP__) __host__ #endif - FFT2D(int nx, int ny, int stride, int dist, int batch) = delete; + FFT2D(const int nx, const int ny, const int stride, const int dist, + const int batch) = delete; }; template <> -FFT2D::FFT2D(int nx, int ny) { +FFT2D::FFT2D(const int nx, const int ny) { checkCuFFTCall(cufftCreate(plan())); checkCuFFTCall(cufftPlan2d(plan(), nx, ny, CUFFT_C2C)); } template <> -FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { +FFT2D::FFT2D(const int nx, const int ny, const int stride, + const int dist, const int batch) { checkCuFFTCall(cufftCreate(plan())); std::array n{nx, ny}; checkCuFFTCall(cufftPlanMany(plan(), 2, n.data(), n.data(), stride, dist, @@ -208,57 +211,58 @@ FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { } template <> -FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { +FFT2D::FFT2D(const int nx, const int ny, const int stride, + const int dist, const int batch) { checkCuFFTCall(cufftCreate(plan())); const int rank = 2; size_t ws = 0; std::array n{nx, ny}; - int istride = stride; - int ostride = stride; - long long int idist = dist; - long long int odist = dist; + const int istride = stride; + const int ostride = stride; + const long long int idist = dist; + const long long int odist = dist; checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride, idist, CUDA_C_16F, nullptr, ostride, odist, CUDA_C_16F, batch, &ws, CUDA_C_16F)); } template <> -FFT2D::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {} +FFT2D::FFT2D(const int nx, const int ny) + : FFT2D(nx, ny, 1, nx * ny, 1) {} /* - * FFT2DRealToComplex + * FFT1D_R2C */ template -class FFT1DRealToComplex : public FFT { +class FFT1D_R2C : public FFT { public: #if defined(__HIP__) __host__ #endif - FFT1DRealToComplex(int nx) = delete; + FFT1D_R2C(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1DRealToComplex(int nx, int batch) = delete; + FFT1D_R2C(const int nx, const int batch) = delete; #if defined(__HIP__) __host__ #endif - FFT1DRealToComplex(int nx, int batch, long long inembed, - long long ouembed) = delete; + FFT1D_R2C(const int nx, const int batch, long long inembed, + long long ouembed) = delete; }; template <> -FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, - long long int inembed, - long long int ouembed) { +FFT1D_R2C::FFT1D_R2C(const int nx, const int batch, + long long inembed, long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = inembed; - long long int odist = ouembed; - int istride = 1; - int ostride = 1; + const long long idist = inembed; + const long long odist = ouembed; + const long long istride = 1; + const long long ostride = 1; checkCuFFTCall(cufftXtMakePlanMany( *plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed, @@ -266,38 +270,37 @@ FFT1DRealToComplex::FFT1DRealToComplex(int nx, int batch, } /* - * FFT1DComplexToReal + * FFT1D_C2R */ template -class FFT1DComplexToReal : public FFT { +class FFT1D_C2R : public FFT { public: #if defined(__HIP__) __host__ #endif - FFT1DComplexToReal(int nx) = delete; + FFT1D_C2R(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1DComplexToReal(int nx, int batch) = delete; + FFT1D_C2R(const int nx, const int batch) = delete; #if defined(__HIP__) __host__ #endif - FFT1DComplexToReal(int nx, int batch, long long inembed, - long long ouembed) = delete; + FFT1D_C2R(const int nx, const int batch, long long inembed, + long long ouembed) = delete; }; template <> -FFT1DComplexToReal::FFT1DComplexToReal(int nx, int batch, - long long int inembed, - long long int ouembed) { +FFT1D_C2R::FFT1D_C2R(const int nx, const int batch, + long long inembed, long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = inembed; - long long int odist = ouembed; - int istride = 1; - int ostride = 1; + const long long idist = inembed; + const long long odist = ouembed; + const int istride = 1; + const int ostride = 1; checkCuFFTCall(cufftXtMakePlanMany( *plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed, diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index 2abb948..55c8f6f 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -12,26 +12,26 @@ TEST_CASE("Test cu::Device", "[device]") { cu::Device device(0); cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); - SECTION("Test Device.getName") { + SECTION("Test Device.getName", "[device]") { const std::string name = device.getName(); std::cout << "Device name: " << name << std::endl; CHECK(name.size() > 0); } - SECTION("Test Device.getArch") { + SECTION("Test Device.getArch", "[device]") { const std::string arch = device.getArch(); std::cout << "Device arch: " << arch << std::endl; CHECK(arch.size() > 0); } - SECTION("Test device::totalMem", "[device]") { - const size_t total_mem = device.totalMem(); + SECTION("Test device.getTotalMem", "[device]") { + const size_t total_mem = device.getTotalMem(); std::cout << "Device total memory: " << (total_mem / (1024 * 1024)) << " bytes" << std::endl; CHECK(total_mem > 0); } - SECTION("Test Device.getTotalConstMem") { + SECTION("Test Device.getTotalConstMem", "[device]") { const size_t const_mem = device.getTotalConstMem(); std::cout << "Device constant memory: " << const_mem << " bytes" << std::endl; diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp index 98af26a..96194aa 100644 --- a/tests/test_cufft.cpp +++ b/tests/test_cufft.cpp @@ -115,7 +115,7 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") { compare(out_ptr, in_ptr, size); } - SECTION("FP32 R2C C2R") { + SECTION("FP32 FFT with Real-To-Complex translation, and back") { const size_t arraySize = size * sizeof(cufftComplex); cu::HostMemory h_in(arraySize); @@ -127,8 +127,8 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") { generateSignal(static_cast(h_in), size, patchSize, {1, 1}); stream.memcpyHtoDAsync(d_in, h_in, arraySize); - cufft::FFT1DRealToComplex fft_r2c(size, 1, 1, 1); - cufft::FFT1DComplexToReal fft_c2r(size, 1, 1, 1); + cufft::FFT1D_R2C fft_r2c(size, 1, 1, 1); + cufft::FFT1D_C2R fft_c2r(size, 1, 1, 1); fft_r2c.setStream(stream); fft_c2r.setStream(stream); From 382b075fa6d2819e7963809a9abd4bf290e70d3f Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Fri, 1 Nov 2024 14:00:33 +0100 Subject: [PATCH 11/23] Fixed typo --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e48cd01..0980101 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` for 2D memsets - Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` for 1D real-to-complex and - vice verse FFT + vice versa FFT ### Changed From be64ac2617419ee4bf441f14062b3a3e4381510a Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Mon, 4 Nov 2024 15:19:45 +0100 Subject: [PATCH 12/23] Added cu::Device::getOrdinal() to retrieve device identifier. --- CHANGELOG.md | 1 + include/cudawrappers/cu.hpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0980101..03f7d22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ project adheres to [Semantic Versioning](http://semver.org/). memsets - Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` for 1D real-to-complex and vice versa FFT +- Added cu::Device::getOrdinal(). ### Changed diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 09abeda..202ebbd 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -193,6 +193,8 @@ class Device : public Wrapper { getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); } + int getOrdinal() const { return _ordinal; } + // Primary Context Management std::pair primaryCtxGetState() const { unsigned flags{}; From 4e299595159016b6b7702ab9b85e5fda9a5a4e5a Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Mon, 4 Nov 2024 15:25:38 +0100 Subject: [PATCH 13/23] Added test Device.getOrdinal. --- tests/test_cu.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index 55c8f6f..cf5bc0a 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -37,6 +37,11 @@ TEST_CASE("Test cu::Device", "[device]") { << std::endl; CHECK(const_mem > 0); } + + SECTION("Test Device.getOrdinal", "[device]") { + const int dev_ordinal = device.getOrdinal(); + CHECK(dev_ordinal >= 0); + } } TEST_CASE("Test context::getDevice", "[device]") { From 27a8cb283ca0751e4c929cb1900cd035c45bcebc Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Tue, 5 Nov 2024 08:57:19 +0100 Subject: [PATCH 14/23] Update CHANGELOG.md Co-authored-by: Bram Veenboer --- CHANGELOG.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03f7d22..9f2d1cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,10 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::Stream::memsetAsync()` - Added `nvml::Device::getPower()` - Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, - and `cu::Stream::memcpyDtoD2DAsync()` for 2D asynchronous memory copies. -- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` for 2D - memsets -- Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` for 1D real-to-complex and - vice versa FFT -- Added cu::Device::getOrdinal(). + and `cu::Stream::memcpyDtoD2DAsync()` +- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` +- Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` +- Added `cu::Device::getOrdinal()` ### Changed From 8437f1485eefdb8163fb449f99abbf7178b83c2f Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Tue, 5 Nov 2024 09:00:31 +0100 Subject: [PATCH 15/23] Renamed functions and removed getTotalConstMem() --- CHANGELOG.md | 2 +- include/cudawrappers/cu.hpp | 7 +------ include/cudawrappers/cufft.hpp | 30 +++++++++++++++--------------- tests/test_cu.cpp | 11 ++--------- tests/test_cufft.cpp | 4 ++-- 5 files changed, 21 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f2d1cd..d379acf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, and `cu::Stream::memcpyDtoD2DAsync()` - Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` -- Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` +- Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R` - Added `cu::Device::getOrdinal()` ### Changed diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 202ebbd..bacec83 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -182,17 +182,12 @@ class Device : public Wrapper { #endif } - size_t getTotalMem() const { + size_t totalMem() const { size_t size{}; checkCudaCall(cuDeviceTotalMem(&size, _obj)); return size; } - size_t getTotalConstMem() const { - return static_cast( - getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); - } - int getOrdinal() const { return _ordinal; } // Primary Context Management diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index fb96132..57a08df 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -231,30 +231,30 @@ FFT2D::FFT2D(const int nx, const int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {} /* - * FFT1D_R2C + * FFT1DR2C */ template -class FFT1D_R2C : public FFT { +class FFT1DR2C : public FFT { public: #if defined(__HIP__) __host__ #endif - FFT1D_R2C(const int nx) = delete; + FFT1DR2C(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1D_R2C(const int nx, const int batch) = delete; + FFT1DR2C(const int nx, const int batch) = delete; #if defined(__HIP__) __host__ #endif - FFT1D_R2C(const int nx, const int batch, long long inembed, - long long ouembed) = delete; + FFT1DR2C(const int nx, const int batch, long long inembed, + long long ouembed) = delete; }; template <> -FFT1D_R2C::FFT1D_R2C(const int nx, const int batch, - long long inembed, long long ouembed) { +FFT1DR2C::FFT1DR2C(const int nx, const int batch, long long inembed, + long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; @@ -273,26 +273,26 @@ FFT1D_R2C::FFT1D_R2C(const int nx, const int batch, * FFT1D_C2R */ template -class FFT1D_C2R : public FFT { +class FFT1DC2R : public FFT { public: #if defined(__HIP__) __host__ #endif - FFT1D_C2R(const int nx) = delete; + FFT1DC2R(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1D_C2R(const int nx, const int batch) = delete; + FFT1DC2R(const int nx, const int batch) = delete; #if defined(__HIP__) __host__ #endif - FFT1D_C2R(const int nx, const int batch, long long inembed, - long long ouembed) = delete; + FFT1DC2R(const int nx, const int batch, long long inembed, + long long ouembed) = delete; }; template <> -FFT1D_C2R::FFT1D_C2R(const int nx, const int batch, - long long inembed, long long ouembed) { +FFT1DC2R::FFT1DC2R(const int nx, const int batch, long long inembed, + long long ouembed) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index cf5bc0a..2986498 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -24,20 +24,13 @@ TEST_CASE("Test cu::Device", "[device]") { CHECK(arch.size() > 0); } - SECTION("Test device.getTotalMem", "[device]") { - const size_t total_mem = device.getTotalMem(); + SECTION("Test device.totalMem", "[device]") { + const size_t total_mem = device.totalMem(); std::cout << "Device total memory: " << (total_mem / (1024 * 1024)) << " bytes" << std::endl; CHECK(total_mem > 0); } - SECTION("Test Device.getTotalConstMem", "[device]") { - const size_t const_mem = device.getTotalConstMem(); - std::cout << "Device constant memory: " << const_mem << " bytes" - << std::endl; - CHECK(const_mem > 0); - } - SECTION("Test Device.getOrdinal", "[device]") { const int dev_ordinal = device.getOrdinal(); CHECK(dev_ordinal >= 0); diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp index 96194aa..afc5354 100644 --- a/tests/test_cufft.cpp +++ b/tests/test_cufft.cpp @@ -127,8 +127,8 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") { generateSignal(static_cast(h_in), size, patchSize, {1, 1}); stream.memcpyHtoDAsync(d_in, h_in, arraySize); - cufft::FFT1D_R2C fft_r2c(size, 1, 1, 1); - cufft::FFT1D_C2R fft_c2r(size, 1, 1, 1); + cufft::FFT1DR2C fft_r2c(size, 1, 1, 1); + cufft::FFT1DC2R fft_c2r(size, 1, 1, 1); fft_r2c.setStream(stream); fft_c2r.setStream(stream); From f2e6d331b100775e4b325bf2b0dc6a5e5a90d771 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Wed, 6 Nov 2024 15:09:35 +0100 Subject: [PATCH 16/23] Rewritten cu::DeviceMemory::operator* for more tolerant casting Currently, it is only allowed use the deference operator in case of managed memory. This commit relaxes this requirement a bit by also allowing access to non-managed memory. This enables casts like this: cu::DeviceMemory(1024) mem; float* ptr = static_cast(mem); therefore avoiding an intermediate cast to CUdeviceptr. --- include/cudawrappers/cu.hpp | 32 ++++++++++++++++++++++---------- tests/test_cu.cpp | 2 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index bacec83..d1a2084 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -45,6 +45,20 @@ inline void checkCudaCall(CUresult result) { if (result != CUDA_SUCCESS) throw Error(result); } +template +inline void checkPointerAccess(const T &pointer) { + CUmemorytype memoryType; + checkCudaCall(cuPointerGetAttribute( + &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)); + + if (memoryType != CU_MEMORYTYPE_DEVICE && + memoryType != CU_MEMORYTYPE_UNIFIED) { + throw std::runtime_error( + "Invalid memory type: only CU_MEMORYTYPE_DEVICE and " + "CU_MEMORYTYPE_UNIFIED are supported."); + } +} + inline void init(unsigned flags = 0) { checkCudaCall(cuInit(flags)); } inline int driverGetVersion() { @@ -632,19 +646,17 @@ class DeviceMemory : public Wrapper { { return &_obj; } - void *parameter_copy() { return reinterpret_cast(_obj); } template operator T *() { - int data; - checkCudaCall( - cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj)); - if (data) { - return reinterpret_cast(_obj); - } else { - throw std::runtime_error( - "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer."); - } + checkPointerAccess(_obj); + return reinterpret_cast(_obj); + } + + template + operator T *() const { + checkPointerAccess(_obj); + return reinterpret_cast(_obj); } size_t size() const { return _size; } diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index 2986498..91fb020 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -221,7 +221,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") { cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0); float* ptr; - CHECK_THROWS(ptr = mem); + CHECK_NOTHROWS(ptr = mem); } SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") { From 1ee98d8aa512589845905a5046f49f3d61e6fedf Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Wed, 6 Nov 2024 15:23:58 +0100 Subject: [PATCH 17/23] Fixed typo. --- tests/test_cu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index 91fb020..744f80d 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -221,7 +221,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") { cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0); float* ptr; - CHECK_NOTHROWS(ptr = mem); + CHECK_NOTHROW(ptr = mem); } SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") { From 7e83d21caa38728fd48b5416da1916b317037270 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 09:35:53 +0100 Subject: [PATCH 18/23] Moved checkPointerAccess method to Wrapper --- include/cudawrappers/cu.hpp | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index d1a2084..2438bee 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -1,3 +1,4 @@ +#include #if !defined CU_WRAPPER_H #define CU_WRAPPER_H @@ -45,20 +46,6 @@ inline void checkCudaCall(CUresult result) { if (result != CUDA_SUCCESS) throw Error(result); } -template -inline void checkPointerAccess(const T &pointer) { - CUmemorytype memoryType; - checkCudaCall(cuPointerGetAttribute( - &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)); - - if (memoryType != CU_MEMORYTYPE_DEVICE && - memoryType != CU_MEMORYTYPE_UNIFIED) { - throw std::runtime_error( - "Invalid memory type: only CU_MEMORYTYPE_DEVICE and " - "CU_MEMORYTYPE_UNIFIED are supported."); - } -} - inline void init(unsigned flags = 0) { checkCudaCall(cuInit(flags)); } inline int driverGetVersion() { @@ -108,6 +95,22 @@ class Wrapper { explicit Wrapper(T &obj) : _obj(obj) {} + template + inline void checkPointerAccess(const CUdeviceptr &pointer) const { + CUmemorytype memoryType; + checkCudaCall(cuPointerGetAttribute( + &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)); + + // Check if the memoryType is one of the allowed memory types + bool isAllowed = false; + for (auto allowedType : {AllowedMemoryTypes...}) { + if (memoryType == allowedType) return; + } + + throw std::runtime_error( + "Invalid memory type: allowed types are not matched."); + } + T _obj{}; std::shared_ptr manager; }; @@ -649,13 +652,13 @@ class DeviceMemory : public Wrapper { template operator T *() { - checkPointerAccess(_obj); + checkPointerAccess(_obj); return reinterpret_cast(_obj); } template operator T *() const { - checkPointerAccess(_obj); + checkPointerAccess(_obj); return reinterpret_cast(_obj); } From 7b68f9abba4cb261b7079b28fd31f40cd2f666a7 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 09:38:57 +0100 Subject: [PATCH 19/23] Fixed Zenodo link redirect test fail --- README.dev.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.dev.md b/README.dev.md index dcf02f1..f797b00 100644 --- a/README.dev.md +++ b/README.dev.md @@ -299,5 +299,5 @@ To view the generated documentation, open `_build/html/index.html` in your web-b ### Verification -1. Make sure the new release is added to Zenodo (see ) +1. Make sure the new release is added to Zenodo (see ) 1. Activate the latest release documentation at From 79045ef1d8baedd2f5963c5cad57db547c4811f0 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 09:44:17 +0100 Subject: [PATCH 20/23] Removed unnecessarily file include. --- include/cudawrappers/cu.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 2438bee..53ade95 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -1,4 +1,3 @@ -#include #if !defined CU_WRAPPER_H #define CU_WRAPPER_H From 3fe604dd50ac2cec32ebc228bf3360469fa732da Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 11:09:01 +0100 Subject: [PATCH 21/23] Update include/cudawrappers/cu.hpp Co-authored-by: Bram Veenboer --- include/cudawrappers/cu.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 53ade95..169d069 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -101,7 +101,6 @@ class Wrapper { &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)); // Check if the memoryType is one of the allowed memory types - bool isAllowed = false; for (auto allowedType : {AllowedMemoryTypes...}) { if (memoryType == allowedType) return; } From 5d98ff5d2944831f21d0650cc072f47a78439096 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 11:09:11 +0100 Subject: [PATCH 22/23] Update include/cudawrappers/cu.hpp Co-authored-by: Bram Veenboer --- include/cudawrappers/cu.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 169d069..0399bd3 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -102,7 +102,9 @@ class Wrapper { // Check if the memoryType is one of the allowed memory types for (auto allowedType : {AllowedMemoryTypes...}) { - if (memoryType == allowedType) return; + if (memoryType == allowedType) { + return; + } } throw std::runtime_error( From b8c3566b4651af180bf0074dde78561be61655e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:09:16 +0000 Subject: [PATCH 23/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- include/cudawrappers/cu.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 0399bd3..06cc69a 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -103,7 +103,7 @@ class Wrapper { // Check if the memoryType is one of the allowed memory types for (auto allowedType : {AllowedMemoryTypes...}) { if (memoryType == allowedType) { - return; + return; } }