Skip to content

Commit

Permalink
Processed MR feedback.
Browse files Browse the repository at this point in the history
  • Loading branch information
wvbbreu committed Nov 1, 2024
1 parent ef908e2 commit 9702b89
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 60 deletions.
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@ project adheres to [Semantic Versioning](http://semver.org/).
- Added `cu::DeviceMemory::memset()`
- Added `cu::Stream::memsetAsync()`
- Added `nvml::Device::getPower()`
- Added 2D memcpy and memset operations
- Added `FFT1DRealToComplex` and `FFT1DComplexToReal`
- Added `cu::Stream::memcpyHtoD2DAsync()`, `cu::Stream::memcpyDtoH2DAsync()`,
and `cu::Stream::memcpyDtoD2DAsync()` for 2D asynchronous memory copies.
- Added `cu::DeviceMemory::memset2D()` and `cu::Stream::memset2DAsync()` for 2D
memsets
- Added `cufft::FFT1D_R2C` and `cufft::FFT1D_C2R` for 1D real-to-complex and
complex-to-real FFTs

### Changed

Expand All @@ -26,6 +30,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
- Upgrade Catch2 to version v3.6.0
- `target_embed_source` is now more robust: it properly tracks dependencies and
runs again whenever any of them changes
- Expanded tests to cover the new 2D memory operations and FFT support

## \[0.8.0\] - 2024-07-05

Expand Down
2 changes: 1 addition & 1 deletion include/cudawrappers/cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ class Device : public Wrapper<CUdevice> {
#endif
}

size_t totalMem() const {
size_t getTotalMem() const {
size_t size{};
checkCudaCall(cuDeviceTotalMem(&size, _obj));
return size;
Expand Down
101 changes: 52 additions & 49 deletions include/cudawrappers/cufft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,19 @@ class FFT {

~FFT() { checkCuFFTCall(cufftDestroy(plan_)); }

void setStream(cu::Stream &stream) {
void setStream(cu::Stream &stream) const {
checkCuFFTCall(cufftSetStream(plan_, stream));
}

void execute(cu::DeviceMemory &in, cu::DeviceMemory &out, int direction) {
void execute(cu::DeviceMemory &in, cu::DeviceMemory &out,
const int direction) const {
void *in_ptr = reinterpret_cast<void *>(static_cast<CUdeviceptr>(in));
void *out_ptr = reinterpret_cast<void *>(static_cast<CUdeviceptr>(out));
checkCuFFTCall(cufftXtExec(plan_, in_ptr, out_ptr, direction));
}

protected:
void checkCuFFTCall(cufftResult result) {
void checkCuFFTCall(cufftResult result) const {
if (result != CUFFT_SUCCESS) {
throw Error(result);
}
Expand All @@ -143,39 +144,39 @@ class FFT1D : public FFT {
#if defined(__HIP__)
__host__
#endif
FFT1D(int nx) = delete;
FFT1D(const int nx) = delete;
#if defined(__HIP__)
__host__
#endif
FFT1D(int nx, int batch) = delete;
FFT1D(const int nx, const int batch) = delete;
};

template <>
FFT1D<CUDA_C_32F>::FFT1D(int nx, int batch) {
FFT1D<CUDA_C_32F>::FFT1D(const int nx, const int batch) {
checkCuFFTCall(cufftCreate(plan()));
checkCuFFTCall(cufftPlan1d(plan(), nx, CUFFT_C2C, batch));
}

template <>
FFT1D<CUDA_C_32F>::FFT1D(int nx) : FFT1D(nx, 1) {}
FFT1D<CUDA_C_32F>::FFT1D(const int nx) : FFT1D(nx, 1) {}

template <>
FFT1D<CUDA_C_16F>::FFT1D(int nx, int batch) {
FFT1D<CUDA_C_16F>::FFT1D(const int nx, const int batch) {
checkCuFFTCall(cufftCreate(plan()));
const int rank = 1;
size_t ws = 0;
std::array<long long, 1> n{nx};
long long int idist = 1;
long long int odist = 1;
int istride = 1;
int ostride = 1;
const long long idist = 1;
const long long odist = 1;
const int istride = 1;
const int ostride = 1;
checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride,
idist, CUDA_C_16F, nullptr, ostride, odist,
CUDA_C_16F, batch, &ws, CUDA_C_16F));
}

template <>
FFT1D<CUDA_C_16F>::FFT1D(int nx) : FFT1D(nx, 1) {}
FFT1D<CUDA_C_16F>::FFT1D(const int nx) : FFT1D(nx, 1) {}

/*
* FFT2D
Expand All @@ -186,118 +187,120 @@ class FFT2D : public FFT {
#if defined(__HIP__)
__host__
#endif
FFT2D(int nx, int ny) = delete;
FFT2D(const int nx, const int ny) = delete;
#if defined(__HIP__)
__host__
#endif
FFT2D(int nx, int ny, int stride, int dist, int batch) = delete;
FFT2D(const int nx, const int ny, const int stride, const int dist,
const int batch) = delete;
};

template <>
FFT2D<CUDA_C_32F>::FFT2D(int nx, int ny) {
FFT2D<CUDA_C_32F>::FFT2D(const int nx, const int ny) {
checkCuFFTCall(cufftCreate(plan()));
checkCuFFTCall(cufftPlan2d(plan(), nx, ny, CUFFT_C2C));
}

template <>
FFT2D<CUDA_C_32F>::FFT2D(int nx, int ny, int stride, int dist, int batch) {
FFT2D<CUDA_C_32F>::FFT2D(const int nx, const int ny, const int stride,
const int dist, const int batch) {
checkCuFFTCall(cufftCreate(plan()));
std::array<int, 2> n{nx, ny};
checkCuFFTCall(cufftPlanMany(plan(), 2, n.data(), n.data(), stride, dist,
n.data(), stride, dist, CUFFT_C2C, batch));
}

template <>
FFT2D<CUDA_C_16F>::FFT2D(int nx, int ny, int stride, int dist, int batch) {
FFT2D<CUDA_C_16F>::FFT2D(const int nx, const int ny, const int stride,
const int dist, const int batch) {
checkCuFFTCall(cufftCreate(plan()));
const int rank = 2;
size_t ws = 0;
std::array<long long, 2> n{nx, ny};
int istride = stride;
int ostride = stride;
long long int idist = dist;
long long int odist = dist;
const int istride = stride;
const int ostride = stride;
const long long int idist = dist;
const long long int odist = dist;
checkCuFFTCall(cufftXtMakePlanMany(*plan(), rank, n.data(), nullptr, istride,
idist, CUDA_C_16F, nullptr, ostride, odist,
CUDA_C_16F, batch, &ws, CUDA_C_16F));
}

template <>
FFT2D<CUDA_C_16F>::FFT2D(int nx, int ny) : FFT2D(nx, ny, 1, nx * ny, 1) {}
FFT2D<CUDA_C_16F>::FFT2D(const int nx, const int ny)
: FFT2D(nx, ny, 1, nx * ny, 1) {}

/*
* FFT2DRealToComplex
* FFT1D_R2C
*/
template <cudaDataType_t T>
class FFT1DRealToComplex : public FFT {
class FFT1D_R2C : public FFT {
public:
#if defined(__HIP__)
__host__
#endif
FFT1DRealToComplex(int nx) = delete;
FFT1D_R2C(const int nx) = delete;
#if defined(__HIP__)
__host__
#endif
FFT1DRealToComplex(int nx, int batch) = delete;
FFT1D_R2C(const int nx, const int batch) = delete;

#if defined(__HIP__)
__host__
#endif
FFT1DRealToComplex(int nx, int batch, long long inembed,
long long ouembed) = delete;
FFT1D_R2C(const int nx, const int batch, long long inembed,
long long ouembed) = delete;
};

template <>
FFT1DRealToComplex<CUDA_R_32F>::FFT1DRealToComplex(int nx, int batch,
long long int inembed,
long long int ouembed) {
FFT1D_R2C<CUDA_R_32F>::FFT1D_R2C(const int nx, const int batch,
long long inembed, long long ouembed) {
checkCuFFTCall(cufftCreate(plan()));
const int rank = 1;
size_t ws = 0;
std::array<long long, 1> n{nx};
long long int idist = inembed;
long long int odist = ouembed;
int istride = 1;
int ostride = 1;
const long long idist = inembed;
const long long odist = ouembed;
const long long istride = 1;
const long long ostride = 1;

checkCuFFTCall(cufftXtMakePlanMany(
*plan(), rank, n.data(), &inembed, istride, idist, CUDA_R_32F, &ouembed,
ostride, odist, CUDA_C_32F, batch, &ws, CUDA_C_32F));
}

/*
* FFT1DComplexToReal
* FFT1D_C2R
*/
template <cudaDataType_t T>
class FFT1DComplexToReal : public FFT {
class FFT1D_C2R : public FFT {
public:
#if defined(__HIP__)
__host__
#endif
FFT1DComplexToReal(int nx) = delete;
FFT1D_C2R(const int nx) = delete;
#if defined(__HIP__)
__host__
#endif
FFT1DComplexToReal(int nx, int batch) = delete;
FFT1D_C2R(const int nx, const int batch) = delete;
#if defined(__HIP__)
__host__
#endif
FFT1DComplexToReal(int nx, int batch, long long inembed,
long long ouembed) = delete;
FFT1D_C2R(const int nx, const int batch, long long inembed,
long long ouembed) = delete;
};

template <>
FFT1DComplexToReal<CUDA_C_32F>::FFT1DComplexToReal(int nx, int batch,
long long int inembed,
long long int ouembed) {
FFT1D_C2R<CUDA_C_32F>::FFT1D_C2R(const int nx, const int batch,
long long inembed, long long ouembed) {
checkCuFFTCall(cufftCreate(plan()));
const int rank = 1;
size_t ws = 0;
std::array<long long, 1> n{nx};
long long int idist = inembed;
long long int odist = ouembed;
int istride = 1;
int ostride = 1;
const long long idist = inembed;
const long long odist = ouembed;
const int istride = 1;
const int ostride = 1;

checkCuFFTCall(cufftXtMakePlanMany(
*plan(), rank, n.data(), &inembed, istride, idist, CUDA_C_32F, &ouembed,
Expand Down
10 changes: 5 additions & 5 deletions tests/test_cu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,26 @@ TEST_CASE("Test cu::Device", "[device]") {
cu::Device device(0);
cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);

SECTION("Test Device.getName") {
SECTION("Test Device.getName", "[device]") {
const std::string name = device.getName();
std::cout << "Device name: " << name << std::endl;
CHECK(name.size() > 0);
}

SECTION("Test Device.getArch") {
SECTION("Test Device.getArch", "[device]") {
const std::string arch = device.getArch();
std::cout << "Device arch: " << arch << std::endl;
CHECK(arch.size() > 0);
}

SECTION("Test device::totalMem", "[device]") {
const size_t total_mem = device.totalMem();
SECTION("Test device.getTotalMem", "[device]") {
const size_t total_mem = device.getTotalMem();
std::cout << "Device total memory: " << (total_mem / (1024 * 1024))
<< " bytes" << std::endl;
CHECK(total_mem > 0);
}

SECTION("Test Device.getTotalConstMem") {
SECTION("Test Device.getTotalConstMem", "[device]") {
const size_t const_mem = device.getTotalConstMem();
std::cout << "Device constant memory: " << const_mem << " bytes"
<< std::endl;
Expand Down
6 changes: 3 additions & 3 deletions tests/test_cufft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") {
compare(out_ptr, in_ptr, size);
}

SECTION("FP32 R2C C2R") {
SECTION("FP32 FFT with Real-To-Complex translation, and back") {
const size_t arraySize = size * sizeof(cufftComplex);

cu::HostMemory h_in(arraySize);
Expand All @@ -127,8 +127,8 @@ TEST_CASE("Test 1D FFT", "[FFT1D]") {
generateSignal(static_cast<cufftComplex *>(h_in), size, patchSize, {1, 1});
stream.memcpyHtoDAsync(d_in, h_in, arraySize);

cufft::FFT1DRealToComplex<CUDA_R_32F> fft_r2c(size, 1, 1, 1);
cufft::FFT1DComplexToReal<CUDA_C_32F> fft_c2r(size, 1, 1, 1);
cufft::FFT1D_R2C<CUDA_R_32F> fft_r2c(size, 1, 1, 1);
cufft::FFT1D_C2R<CUDA_C_32F> fft_c2r(size, 1, 1, 1);
fft_r2c.setStream(stream);
fft_c2r.setStream(stream);

Expand Down

0 comments on commit 9702b89

Please sign in to comment.