diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1be684d8..a15cfc54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ### Added
 - cufft wrappers for 1D and 2D complex-to-complex FFTs
 - cu::HostMemory constuctor for pre-allocated memory
+- cu::DeviceMemory constructor for managed memory
+- cu::Stream::cuMemPrefetchAsync for pre-fetching of managed memory
 
 ### Changed
 - The vector_add example has now become a test
diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 431bcc07..dbd45117 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -243,8 +243,15 @@ class HostMemory : public Wrapper<void *> {
 class DeviceMemory : public Wrapper<CUdeviceptr> {
  public:
-  explicit DeviceMemory(size_t size) {
-    checkCudaCall(cuMemAlloc(&_obj, size));
+  explicit DeviceMemory(size_t size, CUmemorytype type = CU_MEMORYTYPE_DEVICE,
+                        unsigned int flags = 0) {
+    if (type == CU_MEMORYTYPE_DEVICE and !flags) {
+      checkCudaCall(cuMemAlloc(&_obj, size));
+    } else if (type == CU_MEMORYTYPE_UNIFIED) {
+      checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
+    } else {
+      throw Error(CUDA_ERROR_INVALID_VALUE);
+    }
     manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
                                            [](CUdeviceptr *ptr) {
                                              cuMemFree(*ptr);
@@ -267,6 +274,19 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
   {
     return &_obj;
   }
+
+  template <typename T>
+  operator T *() {
+    bool data;
+    checkCudaCall(
+        cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
+    if (data) {
+      return reinterpret_cast<T *>(_obj);
+    } else {
+      throw std::runtime_error(
+          "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer.");
+    }
+  }
 };
 
 class Array : public Wrapper<CUarray> {
@@ -429,6 +449,11 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
   }
 
+  void memPrefetchAsync(CUdeviceptr devPtr, size_t size,
+                        CUdevice dstDevice = CU_DEVICE_CPU) {
+    checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
+  }
+
   void launchKernel(Function &function, unsigned gridX, unsigned gridY,
                     unsigned gridZ, unsigned blockX, unsigned blockY,
                     unsigned blockZ, unsigned sharedMemBytes,
diff --git a/tests/test_cu.cpp b/tests/test_cu.cpp
index 769bcfbe..0f7805c7 100644
--- a/tests/test_cu.cpp
+++ b/tests/test_cu.cpp
@@ -143,4 +143,17 @@ TEST_CASE("Test zeroing cu::DeviceMemory", "[zero]") {
 
     CHECK(data_in == data_out);
   }
+
+  SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_DEVICE as host pointer") {
+    cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0);
+    float* ptr;
+    CHECK_THROWS(ptr = mem);
+  }
+
+  SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") {
+    cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_UNIFIED,
+                         CU_MEM_ATTACH_GLOBAL);
+    float* ptr = mem;
+    CHECK_NOTHROW(ptr[0] = 42.f);
+  }
 }
diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index 1ad71d1c..1ac90c95 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -15,6 +15,15 @@ void check_arrays_equal(const float *a, const float *b, size_t n) {
   }
 }
 
+void initialize_arrays(float *a, float *b, float *c, float *r, int N) {
+  for (int i = 0; i < N; i++) {
+    a[i] = 1.0 + i;
+    b[i] = 2.0 - (N - i);
+    c[i] = 0.0;
+    r[i] = a[i] + b[i];
+  }
+}
+
 TEST_CASE("Vector add") {
   const std::string kernel = R"(
 extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
@@ -32,28 +41,7 @@ TEST_CASE("Vector add") {
   cu::Device device(0);
   cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
 
-  cu::HostMemory h_a(bytesize);
-  cu::HostMemory h_b(bytesize);
-  cu::HostMemory h_c(bytesize);
-  std::vector<float> reference_c(N);
-
-  float *a = static_cast<float *>(h_a);
-  float *b = static_cast<float *>(h_b);
-  float *c = static_cast<float *>(h_c);
-  for (int i = 0; i < N; i++) {
-    a[i] = 1.0 + i;
-    b[i] = 2.0 - (N - i);
-    c[i] = 0.0;
-    reference_c[i] = a[i] + b[i];
-  }
-
-  cu::DeviceMemory d_a(bytesize);
-  cu::DeviceMemory d_b(bytesize);
-  cu::DeviceMemory d_c(bytesize);
-
   cu::Stream stream;
-  stream.memcpyHtoDAsync(d_a, a, bytesize);
-  stream.memcpyHtoDAsync(d_b, b, bytesize);
 
   std::vector<std::string> options = {};
   nvrtc::Program program(kernel, "vector_add_kernel.cu");
@@ -68,11 +56,87 @@
   cu::Function function(module, "vector_add");
 
   SECTION("Run kernel") {
+    cu::HostMemory h_a(bytesize);
+    cu::HostMemory h_b(bytesize);
+    cu::HostMemory h_c(bytesize);
+    std::vector<float> reference_c(N);
+
+    initialize_arrays(static_cast<float *>(h_a), static_cast<float *>(h_b),
+                      static_cast<float *>(h_c), reference_c.data(), N);
+
+    cu::DeviceMemory d_a(bytesize);
+    cu::DeviceMemory d_b(bytesize);
+    cu::DeviceMemory d_c(bytesize);
+
+    stream.memcpyHtoDAsync(d_a, h_a, bytesize);
+    stream.memcpyHtoDAsync(d_b, h_b, bytesize);
+    std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
+                                            d_b.parameter(), &N};
+    stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
+    stream.memcpyDtoHAsync(h_c, d_c, bytesize);
+    stream.synchronize();
+
+    check_arrays_equal(h_c, reference_c.data(), N);
+  }
+
+  SECTION("Run kernel with managed memory") {
+    cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+    cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+    cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+
+    float *h_a = d_a;
+    float *h_b = d_b;
+    float *h_c = d_c;
+    std::vector<float> reference_c(N);
+
+    initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);
+
     std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
                                             d_b.parameter(), &N};
     stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
-    stream.memcpyDtoHAsync(c, d_c, bytesize);
     stream.synchronize();
-    check_arrays_equal(c, reference_c.data(), N);
+
+    check_arrays_equal(h_c, reference_c.data(), N);
+  }
+
+  SECTION("Run kernel with managed memory and prefetch") {
+    if (device.getAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+      cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+      cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+
+      float *h_a = d_a;
+      float *h_b = d_b;
+      float *h_c = d_c;
+      std::vector<float> reference_c(N);
+
+      initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);
+
+      std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
+                                              d_b.parameter(), &N};
+      stream.memPrefetchAsync(d_a, bytesize, device);
+      stream.memPrefetchAsync(d_b, bytesize, device);
+      stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
+      stream.memPrefetchAsync(d_c, bytesize);
+      stream.synchronize();
+
+      check_arrays_equal(h_c, reference_c.data(), N);
+    }
+  }
+
+  SECTION("Pass invalid CUmemorytype to cu::DeviceMemory constructor") {
+    CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_ARRAY));
+    CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_HOST));
+  }
+
+  SECTION("Pass flags with CU_MEMORYTYPE_DEVICE") {
+    CHECK_NOTHROW(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, 0));
+    CHECK_THROWS(
+        cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_GLOBAL));
+    CHECK_THROWS(
+        cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_HOST));
   }
 }
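Usage note (illustrative only, not part of the patch): the sketch below shows how the pieces added above fit together: the cu::DeviceMemory constructor with CU_MEMORYTYPE_UNIFIED, the templated host-pointer conversion, and cu::Stream::memPrefetchAsync. Everything it calls comes from this diff or from the existing cudawrappers API (cu::init, cu::Device, cu::Context, cu::Stream); the buffer size and the omitted kernel launch are placeholders, and the prefetch-to-device step assumes a device that reports CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, as the prefetch test does.

// Illustrative sketch, not a definitive example from the repository.
#include <cudawrappers/cu.hpp>

int main() {
  cu::init();
  cu::Device device(0);
  cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
  cu::Stream stream;

  const size_t n = 1024;  // placeholder size
  const size_t bytesize = n * sizeof(float);

  // CU_MEMORYTYPE_UNIFIED selects cuMemAllocManaged instead of cuMemAlloc.
  cu::DeviceMemory d_data(bytesize, CU_MEMORYTYPE_UNIFIED,
                          CU_MEM_ATTACH_GLOBAL);

  // The conversion operator hands out a host-usable pointer for managed
  // allocations (and throws for plain device allocations).
  float *h_data = d_data;
  for (size_t i = 0; i < n; i++) {
    h_data[i] = static_cast<float>(i);
  }

  // Prefetch the pages to the device before the kernel uses them.
  stream.memPrefetchAsync(d_data, bytesize, device);

  // ... launch a kernel operating on d_data here ...

  // Prefetch back to the host; dstDevice defaults to CU_DEVICE_CPU.
  stream.memPrefetchAsync(d_data, bytesize);
  stream.synchronize();

  return 0;
}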