Add managed memory #211

Merged (13 commits) on Sep 21, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Added
- cufft wrappers for 1D and 2D complex-to-complex FFTs
- cu::HostMemory constructor for pre-allocated memory
- cu::DeviceMemory constructor for managed memory
- cu::Stream::memPrefetchAsync for prefetching of managed memory

### Changed
- The vector_add example has now become a test
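Taken together, the two new entries make a single allocation usable from both host and device. Below is a minimal usage sketch based on the API added in this PR; the cu::init()/cu::Device/cu::Context setup is assumed to follow the library's existing examples, and the prefetch step assumes hardware with concurrent managed access (the same condition the prefetch test below checks):

#include <cudawrappers/cu.hpp>

int main() {
  cu::init();
  cu::Device device(0);
  cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
  cu::Stream stream;

  constexpr size_t N = 1024;
  constexpr size_t bytesize = N * sizeof(float);

  // Managed (unified) allocation, usable from host and device.
  cu::DeviceMemory mem(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);

  float *data = mem;  // conversion operator; throws for non-managed memory
  for (size_t i = 0; i < N; ++i) data[i] = float(i);

  // Optionally migrate the pages to the device ahead of kernel launches...
  stream.memPrefetchAsync(mem, bytesize, device);
  // ...and back again (the default dstDevice is CU_DEVICE_CPU).
  stream.memPrefetchAsync(mem, bytesize);
  stream.synchronize();
  return 0;
}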
29 changes: 27 additions & 2 deletions include/cudawrappers/cu.hpp
@@ -243,8 +243,15 @@ class HostMemory : public Wrapper<void *> {

class DeviceMemory : public Wrapper<CUdeviceptr> {
public:
  explicit DeviceMemory(size_t size, CUmemorytype type = CU_MEMORYTYPE_DEVICE,
                        unsigned int flags = 0) {
    // Flags are only meaningful for managed allocations; reject any other
    // combination up front.
    if (type == CU_MEMORYTYPE_DEVICE and !flags) {
      checkCudaCall(cuMemAlloc(&_obj, size));
    } else if (type == CU_MEMORYTYPE_UNIFIED) {
      checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
    } else {
      throw Error(CUDA_ERROR_INVALID_VALUE);
    }
manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
[](CUdeviceptr *ptr) {
cuMemFree(*ptr);
@@ -267,6 +274,19 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
{
return &_obj;
}

  template <typename T>
  operator T *() {
    // Only managed (unified) memory is host-accessible; ask the driver.
    // An unsigned int target is at least as wide as the boolean attribute
    // value the driver writes.
    unsigned int is_managed = 0;
    checkCudaCall(cuPointerGetAttribute(
        &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
    if (is_managed) {
      return reinterpret_cast<T *>(_obj);
    } else {
      throw std::runtime_error(
          "Cannot return memory of type CU_MEMORYTYPE_DEVICE as host pointer.");
    }
  }
};

class Array : public Wrapper<CUarray> {
@@ -429,6 +449,11 @@ class Stream : public Wrapper<CUstream> {
checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
}

void memPrefetchAsync(CUdeviceptr devPtr, size_t size,
CUdevice dstDevice = CU_DEVICE_CPU) {
checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
}

void launchKernel(Function &function, unsigned gridX, unsigned gridY,
unsigned gridZ, unsigned blockX, unsigned blockY,
unsigned blockZ, unsigned sharedMemBytes,
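The conversion operator added above is backed by a driver pointer-attribute query. For reference, a standalone sketch of the same check against the raw driver API; isManagedPointer is a hypothetical helper name, not part of this PR:

#include <cuda.h>

// Hypothetical helper: true when `ptr` was allocated with cuMemAllocManaged.
bool isManagedPointer(CUdeviceptr ptr) {
  unsigned int isManaged = 0;
  if (cuPointerGetAttribute(&isManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED,
                            ptr) != CUDA_SUCCESS) {
    return false;  // not a pointer the driver knows about
  }
  return isManaged != 0;
}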
13 changes: 13 additions & 0 deletions tests/test_cu.cpp
@@ -143,4 +143,17 @@ TEST_CASE("Test zeroing cu::DeviceMemory", "[zero]") {

CHECK(data_in == data_out);
}

SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") {
cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0);
float* ptr;
CHECK_THROWS(ptr = mem);
}

SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") {
cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
float* ptr = mem;
CHECK_NOTHROW(ptr[0] = 42.f);
}
}
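Both attach flags appear in the tests: CU_MEM_ATTACH_GLOBAL makes a managed allocation accessible from any stream on any device, while CU_MEM_ATTACH_HOST memory should not be accessed from devices that lack concurrent managed access until it is attached to a stream. The attachment call itself is not wrapped by this PR; a hedged sketch with the raw driver API, assuming `stream` and `d_a` as in the tests below and the wrapper's implicit conversions:

// Raw driver call (not wrapped by this PR): associate a CU_MEM_ATTACH_HOST
// allocation with `stream` so kernels in that stream may access it on
// devices without concurrent managed access. The length must be 0 to
// cover the whole allocation.
checkCudaCall(cuStreamAttachMemAsync(stream, d_a, 0, CU_MEM_ATTACH_SINGLE));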
110 changes: 87 additions & 23 deletions tests/test_vector_add.cpp
@@ -15,6 +15,15 @@ void check_arrays_equal(const float *a, const float *b, size_t n) {
}
}

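// Fill inputs a and b, zero the output c, and store the expected result in r.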
void initialize_arrays(float *a, float *b, float *c, float *r, int N) {
for (int i = 0; i < N; i++) {
a[i] = 1.0 + i;
b[i] = 2.0 - (N - i);
c[i] = 0.0;
r[i] = a[i] + b[i];
}
}

TEST_CASE("Vector add") {
const std::string kernel = R"(
extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
@@ -32,28 +41,7 @@ TEST_CASE("Vector add") {
cu::Device device(0);
cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);

  cu::Stream stream;

std::vector<std::string> options = {};
nvrtc::Program program(kernel, "vector_add_kernel.cu");
@@ -68,11 +56,87 @@
cu::Function function(module, "vector_add");

SECTION("Run kernel") {
cu::HostMemory h_a(bytesize);
cu::HostMemory h_b(bytesize);
cu::HostMemory h_c(bytesize);
std::vector<float> reference_c(N);

initialize_arrays(static_cast<float *>(h_a), static_cast<float *>(h_b),
static_cast<float *>(h_c), reference_c.data(), N);

cu::DeviceMemory d_a(bytesize);
cu::DeviceMemory d_b(bytesize);
cu::DeviceMemory d_c(bytesize);

stream.memcpyHtoDAsync(d_a, h_a, bytesize);
stream.memcpyHtoDAsync(d_b, h_b, bytesize);
std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
stream.memcpyDtoHAsync(h_c, d_c, bytesize);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory") {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);

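    // Managed memory is directly host-accessible via the conversion operator;
    // no separate host buffers or explicit copies are needed.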
float *h_a = d_a;
float *h_b = d_b;
float *h_c = d_c;
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
    stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}

SECTION("Run kernel with managed memory and prefetch") {
if (device.getAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);
cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
CU_MEM_ATTACH_GLOBAL);

float *h_a = d_a;
float *h_b = d_b;
float *h_c = d_c;
std::vector<float> reference_c(N);

initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
d_b.parameter(), &N};
stream.memPrefetchAsync(d_a, bytesize, device);
stream.memPrefetchAsync(d_b, bytesize, device);
stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
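      // No destination device given: the default CU_DEVICE_CPU migrates the
      // results back to host memory before the host reads them.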
stream.memPrefetchAsync(d_c, bytesize);
stream.synchronize();

check_arrays_equal(h_c, reference_c.data(), N);
}
}

SECTION("Pass invalid CUmemorytype to cu::DeviceMemory constructor") {
CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_ARRAY));
CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_HOST));
}

SECTION("Pass flags with CU_MEMORYTYPE_DEVICE") {
CHECK_NOTHROW(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, 0));
CHECK_THROWS(
cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_GLOBAL));
CHECK_THROWS(
cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_HOST));
}
}