From fd7f102b34baa0d5965bb40b30ced8edb7c58c14 Mon Sep 17 00:00:00 2001 From: Wiebe van Breukelen Date: Thu, 7 Nov 2024 12:57:28 +0100 Subject: [PATCH] Added support for 2D operations and C2R/R2C FFT (#305) * Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, and `cu::Stream::memcpyDtoD2DAsync()` * Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` * Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R` * Added `cu::Device::getOrdinal()` * Allow non-managed memory dereferencing in `cu::DeviceMemory` --------- Co-authored-by: Bram Veenboer Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 6 ++ README.dev.md | 2 +- cmake/cudawrappers-helper.cmake | 2 +- include/cudawrappers/cu.hpp | 165 ++++++++++++++++++++++++++++++-- include/cudawrappers/cufft.hpp | 131 ++++++++++++++++++++----- include/cudawrappers/nvtx.hpp | 21 +++- tests/test_cu.cpp | 131 ++++++++++++++++++++++++- tests/test_cufft.cpp | 28 ++++++ 8 files changed, 444 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d4f5898..d379acf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ project adheres to [Semantic Versioning](http://semver.org/). - Added `cu::DeviceMemory::memset()` - Added `cu::Stream::memsetAsync()` - Added `nvml::Device::getPower()` +- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoHD2Async()`, + and `cu::Stream::memcpyDtoD2DAsync()` +- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` +- Added `cufft::FFT1DR2C` and `cufft::FFT1DC2R` +- Added `cu::Device::getOrdinal()` ### Changed @@ -24,6 +29,7 @@ project adheres to [Semantic Versioning](http://semver.org/). - Upgrade Catch2 to version v3.6.0 - `target_embed_source` is now more robust: it properly tracks dependencies and runs again whenever any of them changes +- Expanded tests to cover the new 2D memory operations and FFT support ## \[0.8.0\] - 2024-07-05 diff --git a/README.dev.md b/README.dev.md index dcf02f12..f797b00d 100644 --- a/README.dev.md +++ b/README.dev.md @@ -299,5 +299,5 @@ To view the generated documentation, open `_build/html/index.html` in your web-b ### Verification -1. Make sure the new release is added to Zenodo (see ) +1. Make sure the new release is added to Zenodo (see ) 1. Activate the latest release documentation at diff --git a/cmake/cudawrappers-helper.cmake b/cmake/cudawrappers-helper.cmake index 9ea5173c..1bfdc0c8 100644 --- a/cmake/cudawrappers-helper.cmake +++ b/cmake/cudawrappers-helper.cmake @@ -1,4 +1,4 @@ -# Retun a list of asbolute file names for all the local includes of the +# Return a list of absolute file names for all the local includes of the # input_file. Only files in the root directory will be considered. function(get_local_includes input_file root_dir) file(READ ${input_file} input_file_contents) diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp index 032ab444..06cc69a0 100644 --- a/include/cudawrappers/cu.hpp +++ b/include/cudawrappers/cu.hpp @@ -94,6 +94,23 @@ class Wrapper { explicit Wrapper(T &obj) : _obj(obj) {} + template + inline void checkPointerAccess(const CUdeviceptr &pointer) const { + CUmemorytype memoryType; + checkCudaCall(cuPointerGetAttribute( + &memoryType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)); + + // Check if the memoryType is one of the allowed memory types + for (auto allowedType : {AllowedMemoryTypes...}) { + if (memoryType == allowedType) { + return; + } + } + + throw std::runtime_error( + "Invalid memory type: allowed types are not matched."); + } + T _obj{}; std::shared_ptr manager; }; @@ -188,6 +205,8 @@ class Device : public Wrapper { return size; } + int getOrdinal() const { return _ordinal; } + // Primary Context Management std::pair primaryCtxGetState() const { unsigned flags{}; @@ -597,6 +616,32 @@ class DeviceMemory : public Wrapper { checkCudaCall(cuMemsetD32(_obj, value, size)); } + void memset2D(unsigned char value, size_t pitch, size_t width, + size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else + checkCudaCall(cuMemsetD2D8(_obj, pitch, value, width, height)); +#endif + } + + void memset2D(unsigned short value, size_t pitch, size_t width, + size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else + checkCudaCall(cuMemsetD2D16(_obj, pitch, value, width, height)); +#endif + } + + void memset2D(unsigned int value, size_t pitch, size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2D(_obj, pitch, value, width, height)); +#else + checkCudaCall(cuMemsetD2D32(_obj, pitch, value, width, height)); +#endif + } + void zero(size_t size) { memset(static_cast(0), size); } const void *parameter() @@ -607,15 +652,14 @@ class DeviceMemory : public Wrapper { template operator T *() { - int data; - checkCudaCall( - cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj)); - if (data) { - return reinterpret_cast(_obj); - } else { - throw std::runtime_error( - "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer."); - } + checkPointerAccess(_obj); + return reinterpret_cast(_obj); + } + + template + operator T *() const { + checkPointerAccess(_obj); + return reinterpret_cast(_obj); } size_t size() const { return _size; } @@ -670,6 +714,74 @@ class Stream : public Wrapper { #endif } + void memcpyHtoD2DAsync(DeviceMemory &devPtr, size_t dpitch, + const void *hostPtr, size_t spitch, size_t width, + size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemcpy2DAsync(devPtr, dpitch, hostPtr, spitch, width, + height, hipMemcpyHostToDevice, _obj)); +#else + // Initialize the CUDA_MEMCPY2D structure + CUDA_MEMCPY2D copyParams = {0}; + + // Set width and height for the 2D copy + copyParams.WidthInBytes = width; + copyParams.Height = height; + + // Set the destination (dst) + copyParams.dstXInBytes = 0; + copyParams.dstY = 0; + copyParams.dstPitch = dpitch; + + // Set the source (src) + copyParams.srcXInBytes = 0; + copyParams.srcY = 0; + copyParams.srcPitch = spitch; + + copyParams.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.srcHost = hostPtr; + copyParams.dstDevice = devPtr; + + // Call the driver API function cuMemcpy2DAsync + checkCudaCall(cuMemcpy2DAsync(©Params, _obj)); +#endif + } + + void memcpyDtoH2DAsync(void *hostPtr, size_t dpitch, + const DeviceMemory &devPtr, size_t spitch, + size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemcpy2DAsync(hostPtr, dpitch, devPtr, spitch, width, + height, hipMemcpyDeviceToHost, _obj)); +#else + // Initialize the CUDA_MEMCPY2D structure + CUDA_MEMCPY2D copyParams = {0}; + + // Set width and height for the 2D copy + copyParams.WidthInBytes = width; + copyParams.Height = height; + + // Set the destination (dst) + copyParams.dstXInBytes = 0; + copyParams.dstY = 0; + copyParams.dstPitch = dpitch; + + // Set the source (src) + copyParams.srcXInBytes = 0; + copyParams.srcY = 0; + copyParams.srcPitch = spitch; + + copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.dstMemoryType = CU_MEMORYTYPE_HOST; + copyParams.srcDevice = devPtr; + copyParams.dstHost = hostPtr; + + // Call the driver API function cuMemcpy2DAsync + checkCudaCall(cuMemcpy2DAsync(©Params, _obj)); +#endif + } + void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) { #if defined(__HIP__) checkCudaCall( @@ -716,10 +828,43 @@ class Stream : public Wrapper { checkCudaCall(cuMemsetD32Async(devPtr, value, size, _obj)); } + void memset2DAsync(DeviceMemory &devPtr, unsigned char value, size_t pitch, + size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else + checkCudaCall(cuMemsetD2D8Async(devPtr, pitch, value, width, height, _obj)); +#endif + } + + void memset2DAsync(DeviceMemory &devPtr, unsigned short value, size_t pitch, + size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else + checkCudaCall( + cuMemsetD2D16Async(devPtr, pitch, value, width, height, _obj)); +#endif + } + + void memset2DAsync(DeviceMemory &devPtr, unsigned int value, size_t pitch, + size_t width, size_t height) { +#if defined(__HIP__) + checkCudaCall(hipMemset2DAsync(devPtr, pitch, value, width, height, _obj)); +#else + checkCudaCall( + cuMemsetD2D32Async(devPtr, pitch, value, width, height, _obj)); +#endif + } + void zero(DeviceMemory &devPtr, size_t size) { memsetAsync(devPtr, static_cast(0), size); } + void zero2D(DeviceMemory &devPtr, size_t pitch, size_t width, size_t height) { + memset2DAsync(devPtr, static_cast(0), pitch, width, height); + } + void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, @@ -777,4 +922,4 @@ inline void Event::record(Stream &stream) { } } // namespace cu -#endif +#endif \ No newline at end of file diff --git a/include/cudawrappers/cufft.hpp b/include/cudawrappers/cufft.hpp index 7457fc59..57a08dfd 100644 --- a/include/cudawrappers/cufft.hpp +++ b/include/cudawrappers/cufft.hpp @@ -11,6 +11,7 @@ #include #endif +#include #include #include "cudawrappers/cu.hpp" @@ -110,18 +111,19 @@ class FFT { ~FFT() { checkCuFFTCall(cufftDestroy(plan_)); } - void setStream(cu::Stream &stream) { + void setStream(cu::Stream &stream) const { checkCuFFTCall(cufftSetStream(plan_, stream)); } - void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, int direction) { + void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, + const int direction) const { void *in_ptr = reinterpret_cast(static_cast(in)); void *out_ptr = reinterpret_cast(static_cast(out)); checkCuFFTCall(cufftXtExec(plan_, in_ptr, out_ptr, direction)); } protected: - void checkCuFFTCall(cufftResult result) { + void checkCuFFTCall(cufftResult result) const { if (result != CUFFT_SUCCESS) { throw Error(result); } @@ -142,39 +144,39 @@ class FFT1D : public FFT { #if defined(__HIP__) __host__ #endif - FFT1D(int nx) = delete; + FFT1D(const int nx) = delete; #if defined(__HIP__) __host__ #endif - FFT1D(int nx, int batch) = delete; + FFT1D(const int nx, const int batch) = delete; }; template <> -FFT1D::FFT1D(int nx, int batch) { +FFT1D::FFT1D(const int nx, const int batch) { checkCuFFTCall(cufftCreate(plan())); checkCuFFTCall(cufftPlan1d(plan(), nx, CUFFT_C2C, batch)); } template <> -FFT1D::FFT1D(int nx) : FFT1D(nx, 1) {} +FFT1D::FFT1D(const int nx) : FFT1D(nx, 1) {} template <> -FFT1D::FFT1D(int nx, int batch) { +FFT1D::FFT1D(const int nx, const int batch) { checkCuFFTCall(cufftCreate(plan())); const int rank = 1; size_t ws = 0; std::array n{nx}; - long long int idist = 1; - long long int odist = 1; - int istride = 1; - int ostride = 1; + const long long idist = 1; + const long long odist = 1; + const int istride = 1; + const int ostride = 1; checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride, idist, CUDA_C_16F, nullptr, ostride, odist, CUDA_C_16F, batch, &ws, CUDA_C_16F)); } template <> -FFT1D::FFT1D(int nx) : FFT1D(nx, 1) {} +FFT1D::FFT1D(const int nx) : FFT1D(nx, 1) {} /* * FFT2D @@ -185,21 +187,23 @@ class FFT2D : public FFT { #if defined(__HIP__) __host__ #endif - FFT2D(int nx, int ny) = delete; + FFT2D(const int nx, const int ny) = delete; #if defined(__HIP__) __host__ #endif - FFT2D(int nx, int ny, int stride, int dist, int batch) = delete; + FFT2D(const int nx, const int ny, const int stride, const int dist, + const int batch) = delete; }; template <> -FFT2D::FFT2D(int nx, int ny) { +FFT2D::FFT2D(const int nx, const int ny) { checkCuFFTCall(cufftCreate(plan())); checkCuFFTCall(cufftPlan2d(plan(), nx, ny, CUFFT_C2C)); } template <> -FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { +FFT2D::FFT2D(const int nx, const int ny, const int stride, + const int dist, const int batch) { checkCuFFTCall(cufftCreate(plan())); std::array n{nx, ny}; checkCuFFTCall(cufftPlanMany(plan(), 2, n.data(), n.data(), stride, dist, @@ -207,23 +211,102 @@ FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { } template <> -FFT2D::FFT2D(int nx, int ny, int stride, int dist, int batch) { +FFT2D::FFT2D(const int nx, const int ny, const int stride, + const int dist, const int batch) { checkCuFFTCall(cufftCreate(plan())); const int rank = 2; size_t ws = 0; std::array n{nx, ny}; - int istride = stride; - int ostride = stride; - long long int idist = dist; - long long int odist = dist; + const int istride = stride; + const int ostride = stride; + const long long int idist = dist; + const long long int odist = dist; checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride, idist, CUDA_C_16F, nullptr, ostride, odist, CUDA_C_16F, batch, &ws, CUDA_C_16F)); } template <> -FFT2D::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {} +FFT2D::FFT2D(const int nx, const int ny) + : FFT2D(nx, ny, 1, nx * ny, 1) {} + +/* + * FFT1DR2C + */ +template +class FFT1DR2C : public FFT { + public: +#if defined(__HIP__) + __host__ +#endif + FFT1DR2C(const int nx) = delete; +#if defined(__HIP__) + __host__ +#endif + FFT1DR2C(const int nx, const int batch) = delete; + +#if defined(__HIP__) + __host__ +#endif + FFT1DR2C(const int nx, const int batch, long long inembed, + long long ouembed) = delete; +}; + +template <> +FFT1DR2C::FFT1DR2C(const int nx, const int batch, long long inembed, + long long ouembed) { + checkCuFFTCall(cufftCreate(plan())); + const int rank = 1; + size_t ws = 0; + std::array n{nx}; + const long long idist = inembed; + const long long odist = ouembed; + const long long istride = 1; + const long long ostride = 1; + + checkCuFFTCall(cufftXtMakePlanMany( + *plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed, + ostride, odist, CUDA_C_32F, batch, &ws, CUDA_C_32F)); +} + +/* + * FFT1D_C2R + */ +template +class FFT1DC2R : public FFT { + public: +#if defined(__HIP__) + __host__ +#endif + FFT1DC2R(const int nx) = delete; +#if defined(__HIP__) + __host__ +#endif + FFT1DC2R(const int nx, const int batch) = delete; +#if defined(__HIP__) + __host__ +#endif + FFT1DC2R(const int nx, const int batch, long long inembed, + long long ouembed) = delete; +}; + +template <> +FFT1DC2R::FFT1DC2R(const int nx, const int batch, long long inembed, + long long ouembed) { + checkCuFFTCall(cufftCreate(plan())); + const int rank = 1; + size_t ws = 0; + std::array n{nx}; + const long long idist = inembed; + const long long odist = ouembed; + const int istride = 1; + const int ostride = 1; + + checkCuFFTCall(cufftXtMakePlanMany( + *plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed, + ostride, odist, CUDA_R_32F, batch, &ws, CUDA_C_32F)); +} } // namespace cufft -#endif // CUFFT_H +#endif // CUFFT_H \ No newline at end of file diff --git a/include/cudawrappers/nvtx.hpp b/include/cudawrappers/nvtx.hpp index 5bb6f5fd..31dac0de 100644 --- a/include/cudawrappers/nvtx.hpp +++ b/include/cudawrappers/nvtx.hpp @@ -1,7 +1,9 @@ #if !defined NVTX_H #define NVTX_H +#if !defined(__HIP__) #include +#endif namespace nvtx { @@ -9,6 +11,10 @@ class Marker { public: enum Color { red, green, blue, yellow, black }; +#if defined(__HIP__) + explicit Marker(const char* message, unsigned color = Color::green) {} + +#else explicit Marker(const char* message, unsigned color = Color::green) : _attributes{0} { _attributes.version = NVTX_VERSION; @@ -18,12 +24,21 @@ class Marker { _attributes.messageType = NVTX_MESSAGE_TYPE_ASCII; _attributes.message.ascii = message; } +#endif Marker(const char* message, Color color) : Marker(message, convert(color)) {} - void start() { _id = nvtxRangeStartEx(&_attributes); } + void start() { +#if !defined(__HIP__) + _id = nvtxRangeStartEx(&_attributes); +#endif + } - void end() { nvtxRangeEnd(_id); } + void end() { +#if !defined(__HIP__) + nvtxRangeEnd(_id); +#endif + } private: unsigned int convert(Color color) { @@ -43,8 +58,10 @@ class Marker { } } +#if !defined(__HIP__) nvtxEventAttributes_t _attributes; nvtxRangeId_t _id; +#endif }; } // end namespace nvtx diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp index e383b0f4..744f80d2 100644 --- a/tests/test_cu.cpp +++ b/tests/test_cu.cpp @@ -12,17 +12,29 @@ TEST_CASE("Test cu::Device", "[device]") { cu::Device device(0); cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); - SECTION("Test Device.getName") { + SECTION("Test Device.getName", "[device]") { const std::string name = device.getName(); std::cout << "Device name: " << name << std::endl; CHECK(name.size() > 0); } - SECTION("Test Device.getArch") { + SECTION("Test Device.getArch", "[device]") { const std::string arch = device.getArch(); std::cout << "Device arch: " << arch << std::endl; CHECK(arch.size() > 0); } + + SECTION("Test device.totalMem", "[device]") { + const size_t total_mem = device.totalMem(); + std::cout << "Device total memory: " << (total_mem / (1024 * 1024)) + << " bytes" << std::endl; + CHECK(total_mem > 0); + } + + SECTION("Test Device.getOrdinal", "[device]") { + const int dev_ordinal = device.getOrdinal(); + CHECK(dev_ordinal >= 0); + } } TEST_CASE("Test context::getDevice", "[device]") { @@ -57,6 +69,24 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream", CHECK(src == tgt); } + SECTION("Test copying a 2D std::array to the device and back") { + const std::array, 3> src = { + {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}}; + std::array, 3> tgt = {{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}; + const size_t width = 3 * sizeof(int); + const size_t height = 3; + const size_t pitch = width; + + cu::DeviceMemory mem(pitch * height); + + cu::Stream stream; + stream.memcpyHtoD2DAsync(mem, pitch, src.data(), pitch, width, height); + stream.memcpyDtoH2DAsync(tgt.data(), pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(src == tgt); + } + SECTION("Test copying HostMemory to the device and back") { const size_t N = 3; const size_t size = N * sizeof(int); @@ -79,6 +109,34 @@ TEST_CASE("Test copying cu::DeviceMemory and cu::HostMemory using cu::Stream", CHECK(!static_cast(memcmp(src, tgt, size))); } + + SECTION("Test copying 2D HostMemory to the device and back") { + const size_t width = 3 * sizeof(int); + const size_t height = 3; + const size_t pitch = width; + const size_t size = pitch * height; + cu::HostMemory src(size); + cu::HostMemory tgt(size); + + // Populate the 2D memory with values + int* const src_ptr = static_cast(src); + int* const tgt_ptr = static_cast(tgt); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < 3; ++x) { + src_ptr[y * 3 + x] = y * 3 + x + 1; + tgt_ptr[y * 3 + x] = 0; + } + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + stream.memcpyHtoD2DAsync(mem, pitch, src, pitch, width, height); + stream.memcpyDtoH2DAsync(tgt, pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(static_cast(memcmp(src, tgt, size)) == 0); + } } TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { @@ -163,7 +221,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") { cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0); float* ptr; - CHECK_THROWS(ptr = mem); + CHECK_NOTHROW(ptr = mem); } SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") { @@ -206,7 +264,7 @@ TEST_CASE("Test cu::DeviceMemory", "[devicememory]") { } using TestTypes = std::tuple; -TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) { +TEMPLATE_LIST_TEST_CASE("Test memset 1D", "[memset]", TestTypes) { cu::init(); cu::Device device(0); cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); @@ -263,6 +321,71 @@ TEMPLATE_LIST_TEST_CASE("Test memset", "[memset]", TestTypes) { } } +using TestTypes = std::tuple; +TEMPLATE_LIST_TEST_CASE("Test memset 2D", "[memset]", TestTypes) { + cu::init(); + cu::Device device(0); + cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device); + + SECTION("Test memset2D cu::DeviceMemory asynchronously") { + const size_t width = 3; + const size_t height = 3; + const size_t pitch = width * sizeof(TestType); + const size_t size = pitch * height; + cu::HostMemory a(size); + cu::HostMemory b(size); + TestType value = 0xAA; + + // Populate the memory with initial values + TestType* const a_ptr = static_cast(a); + TestType* const b_ptr = static_cast(b); + for (int i = 0; i < width * height; i++) { + a_ptr[i] = 0; + b_ptr[i] = value; + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + // Perform the 2D memory operations + stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height); + stream.memset2DAsync(mem, value, pitch, width, height); + stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height); + + CHECK(static_cast(memcmp(a, b, size))); + } + + SECTION("Test zeroing cu::DeviceMemory synchronously in 2D") { + const size_t width = 3; + const size_t height = 3; + const size_t pitch = width * sizeof(TestType); + const size_t size = pitch * height; + cu::HostMemory a(size); + cu::HostMemory b(size); + TestType value = 0xAA; + + // Populate the memory with initial values + TestType* const a_ptr = static_cast(a); + TestType* const b_ptr = static_cast(b); + for (int i = 0; i < width * height; i++) { + a_ptr[i] = 0; + b_ptr[i] = value; + } + + cu::DeviceMemory mem(size); + cu::Stream stream; + + // Perform the 2D memory operations + stream.memcpyHtoD2DAsync(mem, pitch, b, pitch, width, height); + stream.synchronize(); + mem.memset2D(value, pitch, width, height); + stream.memcpyDtoH2DAsync(b, pitch, mem, pitch, width, height); + stream.synchronize(); + + CHECK(static_cast(memcmp(a, b, size))); + } +} + TEST_CASE("Test cu::Stream", "[stream]") { cu::init(); cu::Device device(0); diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp index 251091df..afc5354e 100644 --- a/tests/test_cufft.cpp +++ b/tests/test_cufft.cpp @@ -114,6 +114,34 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") { scaleSignal(out_ptr, out_ptr, size, float(size)); compare(out_ptr, in_ptr, size); } + + SECTION("FP32 FFT with Real-To-Complex translation, and back") { + const size_t arraySize = size * sizeof(cufftComplex); + + cu::HostMemory h_in(arraySize); + cu::HostMemory h_out(arraySize); + cu::DeviceMemory d_in(arraySize); + cu::DeviceMemory d_out(arraySize); + cu::DeviceMemory d_out2(arraySize); + + generateSignal(static_cast(h_in), size, patchSize, {1, 1}); + stream.memcpyHtoDAsync(d_in, h_in, arraySize); + + cufft::FFT1DR2C fft_r2c(size, 1, 1, 1); + cufft::FFT1DC2R fft_c2r(size, 1, 1, 1); + fft_r2c.setStream(stream); + fft_c2r.setStream(stream); + + fft_r2c.execute(d_in, d_out, CUFFT_FORWARD); + fft_c2r.execute(d_out, d_out2, CUFFT_INVERSE); + stream.memcpyDtoHAsync(h_out, d_out2, arraySize); + stream.synchronize(); + + cuFloatComplex *in_ptr = static_cast(h_in); + cuFloatComplex *out_ptr = static_cast(h_out); + scaleSignal(out_ptr, out_ptr, size, float(size)); + compare(out_ptr, in_ptr, size); + } } TEST_CASE("Test 2D FFT", "[FFT2D]") {