Add managed memory (#211)
* Add option to allocate managed memory in DeviceMemory
* Add Stream::memPrefetchAsync
* Add operator T *() to DeviceMemory
* Add tests for new functionality
* Update changelog
csbnw authored Sep 21, 2023
1 parent 7520129 commit 2a12b83
Showing 4 changed files with 129 additions and 25 deletions.
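Before the file-by-file diff, here is a minimal usage sketch of the functionality this commit adds. It is not part of the commit: cu::init(), the buffer size, and the fill loop are assumptions for illustration; the wrapper calls themselves follow the tests added below.

#include <cudawrappers/cu.hpp>

int main() {
  cu::init();  // assumed to wrap cuInit(0), as in the library's examples
  cu::Device device(0);
  cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
  cu::Stream stream;

  const size_t bytesize = 1024 * sizeof(float);

  // New constructor: allocate managed (unified) memory instead of a plain
  // device allocation.
  cu::DeviceMemory mem(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);

  // New operator T *(): managed memory is directly usable from the host;
  // for CU_MEMORYTYPE_DEVICE allocations this conversion throws instead.
  float *data = mem;
  for (size_t i = 0; i < 1024; i++) data[i] = float(i);

  // New Stream::memPrefetchAsync: migrate the pages to the GPU ahead of use
  // (the default destination, CU_DEVICE_CPU, prefetches back to the host).
  // The tests below guard this on CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
  stream.memPrefetchAsync(mem, bytesize, device);
  stream.synchronize();

  return 0;
}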
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Added
- cufft wrappers for 1D and 2D complex-to-complex FFTs
- cu::HostMemory constructor for pre-allocated memory
- cu::DeviceMemory constructor for managed memory
- cu::Stream::memPrefetchAsync for prefetching of managed memory

### Changed
- The vector_add example has now become a test
29 changes: 27 additions & 2 deletions include/cudawrappers/cu.hpp
@@ -243,8 +243,15 @@ class HostMemory : public Wrapper<void *> {

class DeviceMemory : public Wrapper<CUdeviceptr> {
public:
// Allocate regular device memory, or managed (unified) memory when type is
// CU_MEMORYTYPE_UNIFIED; flags are passed to cuMemAllocManaged and must be
// zero for CU_MEMORYTYPE_DEVICE.
explicit DeviceMemory(size_t size, CUmemorytype type = CU_MEMORYTYPE_DEVICE,
unsigned int flags = 0) {
if (type == CU_MEMORYTYPE_DEVICE and !flags) {
checkCudaCall(cuMemAlloc(&_obj, size));
} else if (type == CU_MEMORYTYPE_UNIFIED) {
checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
} else {
throw Error(CUDA_ERROR_INVALID_VALUE);
}
manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
[](CUdeviceptr *ptr) {
cuMemFree(*ptr);
@@ -267,6 +274,19 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
{
return &_obj;
}

template <typename T>
operator T *() {
// The conversion is only valid for managed (unified) memory, which is
// host-accessible; query the driver to find out.
bool is_managed;
checkCudaCall(
cuPointerGetAttribute(&is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
if (is_managed) {
return reinterpret_cast<T *>(_obj);
} else {
throw std::runtime_error(
"Cannot return memory of type CU_MEMORYTYPE_DEVICE as a host pointer.");
}
}
};

class Array : public Wrapper<CUarray> {
@@ -429,6 +449,11 @@ class Stream : public Wrapper<CUstream> {
checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
}

// Prefetch managed memory to dstDevice (the host, CU_DEVICE_CPU, by default).
void memPrefetchAsync(CUdeviceptr devPtr, size_t size,
CUdevice dstDevice = CU_DEVICE_CPU) {
checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
}

void launchKernel(Function &function, unsigned gridX, unsigned gridY,
unsigned gridZ, unsigned blockX, unsigned blockY,
unsigned blockZ, unsigned sharedMemBytes,
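For reference, a hedged sketch of how the new memPrefetchAsync is meant to combine with a kernel launch, mirroring the managed-memory test added in tests/test_vector_add.cpp further down. The function name, buffer names, and launch configuration are placeholders, not part of the commit; only the cu:: calls appear in the diff above.

#include <cstddef>
#include <vector>

#include <cudawrappers/cu.hpp>

// Illustrative only: d_in and d_out are assumed to be managed allocations
// (CU_MEMORYTYPE_UNIFIED) and `function` a kernel taking (out, in, n).
void run_with_prefetch(cu::Device &device, cu::Stream &stream,
                       cu::Function &function, cu::DeviceMemory &d_in,
                       cu::DeviceMemory &d_out, size_t bytesize, int n) {
  // Migrate the input pages to the GPU before the kernel touches them.
  stream.memPrefetchAsync(d_in, bytesize, device);

  std::vector<const void *> parameters = {d_out.parameter(), d_in.parameter(),
                                          &n};
  stream.launchKernel(function, 1, 1, 1, n, 1, 1, 0, parameters);

  // Bring the result back to the host; CU_DEVICE_CPU is the default target.
  stream.memPrefetchAsync(d_out, bytesize);
  stream.synchronize();
}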
13 changes: 13 additions & 0 deletions tests/test_cu.cpp
@@ -143,4 +143,17 @@ TEST_CASE("Test zeroing cu::DeviceMemory", "[zero]") {

CHECK(data_in == data_out);
}

SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") {
cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0);
float* ptr;
CHECK_THROWS(ptr = mem);
}

SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") {
cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
float* ptr = mem;
CHECK_NOTHROW(ptr[0] = 42.f);
}
}
110 changes: 87 additions & 23 deletions tests/test_vector_add.cpp
@@ -15,6 +15,15 @@ void check_arrays_equal(const float *a, const float *b, size_t n) {
}
}

void initialize_arrays(float *a, float *b, float *c, float *r, int N) {
for (int i = 0; i < N; i++) {
a[i] = 1.0 + i;
b[i] = 2.0 - (N - i);
c[i] = 0.0;
r[i] = a[i] + b[i];
}
}

TEST_CASE("Vector add") {
const std::string kernel = R"(
extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
@@ -32,28 +41,7 @@ TEST_CASE("Vector add") {
cu::Device device(0);
cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);

cu::Stream stream;

std::vector<std::string> options = {};
nvrtc::Program program(kernel, "vector_add_kernel.cu");
@@ -68,11 +56,87 @@ TEST_CASE("Vector add") {
cu::Function function(module, "vector_add");

SECTION("Run kernel") {
cu::HostMemory h_a(bytesize);
cu::HostMemory h_b(bytesize);
cu::HostMemory h_c(bytesize);
std::vector<float> reference_c(N);

initialize_arrays(static_cast<float *>(h_a), static_cast<float *>(h_b),
static_cast<float *>(h_c), reference_c.data(), N);

cu::DeviceMemory d_a(bytesize);
cu::DeviceMemory d_b(bytesize);
cu::DeviceMemory d_c(bytesize);

stream.memcpyHtoDAsync(d_a, h_a, bytesize);
stream.memcpyHtoDAsync(d_b, h_b, bytesize);
std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.memcpyDtoHAsync(h_c, d_c, bytesize);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory") {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);

float *h_a = d_a;
float *h_b = d_b;
float *h_c = d_c;
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory and prefetch") {
if (device.getAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);

float *h_a = d_a;
float *h_b = d_b;
float *h_c = d_c;
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.memPrefetchAsync(d_a, bytesize, device);
stream.memPrefetchAsync(d_b, bytesize, device);
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.memPrefetchAsync(d_c, bytesize);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}
}

SECTION("Pass invalid CUmemorytype to cu::DeviceMemory constructor") {
CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_ARRAY));
CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_HOST));
}

SECTION("Pass flags with CU_MEMORYTYPE_DEVICE") {
CHECK_NOTHROW(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, 0));
CHECK_THROWS(
cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_GLOBAL));
CHECK_THROWS(
cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_HOST));
}
}
