
Add managed memory #211

Merged: 13 commits, Sep 21, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Added
- cufft wrappers for 1D and 2D complex-to-complex FFTs
- cu::HostMemory constructor for pre-allocated memory
- cu::DeviceMemory constructor for managed memory
- cu::Stream::memPrefetchAsync for prefetching of managed memory

### Changed
- The vector_add example has now become a test
15 changes: 13 additions & 2 deletions include/cudawrappers/cu.hpp
@@ -243,8 +243,15 @@ class HostMemory : public Wrapper<void *> {

class DeviceMemory : public Wrapper<CUdeviceptr> {
public:
explicit DeviceMemory(size_t size) {
checkCudaCall(cuMemAlloc(&_obj, size));
explicit DeviceMemory(size_t size, CUmemorytype type = CU_MEMORYTYPE_DEVICE,
unsigned int flags = 0) {
if (type == CU_MEMORYTYPE_DEVICE and !flags) {
checkCudaCall(cuMemAlloc(&_obj, size));
} else if (type == CU_MEMORYTYPE_UNIFIED) {
checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
} else {
throw Error(CUDA_ERROR_INVALID_VALUE);
}
manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
[](CUdeviceptr *ptr) {
cuMemFree(*ptr);
@@ -429,6 +436,10 @@ class Stream : public Wrapper<CUstream> {
checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
}

void memPrefetchAsync(CUdeviceptr devPtr, CUdevice dstDevice, size_t size) {
checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
}

void launchKernel(Function &function, unsigned gridX, unsigned gridY,
unsigned gridZ, unsigned blockX, unsigned blockY,
unsigned blockZ, unsigned sharedMemBytes,
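A minimal usage sketch of the two additions to cu.hpp (managed-memory allocation plus prefetch), modelled on the tests below; the helper function name is illustrative, and the device, stream, and kernel launch are assumed to be set up as in test_vector_add.cpp:

#include <cudawrappers/cu.hpp>

void managed_memory_sketch(cu::Device &device, cu::Stream &stream, size_t N) {
  const size_t bytesize = N * sizeof(float);

  // Allocate managed (unified) memory instead of regular device memory.
  cu::DeviceMemory d_data(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);

  // Managed memory is directly addressable from the host.
  float *h_data = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_data));
  for (size_t i = 0; i < N; i++) h_data[i] = 1.0f;

  // Optionally migrate the buffer to the GPU before launching work on it...
  stream.memPrefetchAsync(d_data, device, bytesize);
  // ...<launch kernels that read/write d_data here>...
  // ...and back to the CPU before the host inspects the results.
  stream.memPrefetchAsync(d_data, CU_DEVICE_CPU, bytesize);
  stream.synchronize();
}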
97 changes: 74 additions & 23 deletions tests/test_vector_add.cpp
@@ -15,6 +15,15 @@ void check_arrays_equal(const float *a, const float *b, size_t n) {
}
}

void initialize_arrays(float *a, float *b, float *c, float *r, int N) {
for (int i = 0; i < N; i++) {
a[i] = 1.0 + i;
b[i] = 2.0 - (N - i);
c[i] = 0.0;
r[i] = a[i] + b[i];
}
}

TEST_CASE("Vector add") {
const std::string kernel = R"(
extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
@@ -32,28 +41,7 @@ TEST_CASE("Vector add") {
cu::Device device(0);
cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);

cu::HostMemory h_a(bytesize);
cu::HostMemory h_b(bytesize);
cu::HostMemory h_c(bytesize);
std::vector<float> reference_c(N);

float *a = static_cast<float *>(h_a);
float *b = static_cast<float *>(h_b);
float *c = static_cast<float *>(h_c);
for (int i = 0; i < N; i++) {
a[i] = 1.0 + i;
b[i] = 2.0 - (N - i);
c[i] = 0.0;
reference_c[i] = a[i] + b[i];
}

cu::DeviceMemory d_a(bytesize);
cu::DeviceMemory d_b(bytesize);
cu::DeviceMemory d_c(bytesize);

cu::Stream stream;
stream.memcpyHtoDAsync(d_a, a, bytesize);
stream.memcpyHtoDAsync(d_b, b, bytesize);

std::vector<std::string> options = {};
nvrtc::Program program(kernel, "vector_add_kernel.cu");
@@ -68,11 +56,74 @@ TEST_CASE("Vector add") {
cu::Function function(module, "vector_add");

SECTION("Run kernel") {
cu::HostMemory h_a(bytesize);
cu::HostMemory h_b(bytesize);
cu::HostMemory h_c(bytesize);
std::vector<float> reference_c(N);

initialize_arrays(static_cast<float *>(h_a), static_cast<float *>(h_b),
static_cast<float *>(h_c), reference_c.data(), N);

cu::DeviceMemory d_a(bytesize);
cu::DeviceMemory d_b(bytesize);
cu::DeviceMemory d_c(bytesize);

stream.memcpyHtoDAsync(d_a, h_a, bytesize);
stream.memcpyHtoDAsync(d_b, h_b, bytesize);
std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.memcpyDtoHAsync(c, d_c, bytesize);
stream.memcpyDtoHAsync(h_c, d_c, bytesize);
stream.synchronize();
check_arrays_equal(c, reference_c.data(), N);

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory") {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);

float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory and prefetch") {
if (device.getAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);

float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.memPrefetchAsync(d_a, device, bytesize);
stream.memPrefetchAsync(d_b, device, bytesize);
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.memPrefetchAsync(d_c, CU_DEVICE_CPU, bytesize);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}
}
}