From a8c34d06c75818440c91e1480ee6d6a6d204aeb0 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Tue, 19 Sep 2023 14:41:14 +0200
Subject: [PATCH 01/13] Add option to allocate managed memory in DeviceMemory

---
 include/cudawrappers/cu.hpp | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 431bcc07..d22640f9 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -243,13 +243,25 @@ class HostMemory : public Wrapper<void *> {
 
 class DeviceMemory : public Wrapper<CUdeviceptr> {
  public:
-  explicit DeviceMemory(size_t size) {
-    checkCudaCall(cuMemAlloc(&_obj, size));
-    manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
-                                           [](CUdeviceptr *ptr) {
-                                             cuMemFree(*ptr);
-                                             delete ptr;
-                                           });
+  explicit DeviceMemory(size_t size, CUmemorytype type = CU_MEMORYTYPE_DEVICE,
+                        unsigned int flags = 0) {
+    if (type == CU_MEMORYTYPE_DEVICE and !flags) {
+      checkCudaCall(cuMemAlloc(&_obj, size));
+      manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
+                                             [](CUdeviceptr *ptr) {
+                                               cuMemFree(*ptr);
+                                               delete ptr;
+                                             });
+    } else if (type == CU_MEMORYTYPE_UNIFIED) {
+      checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
+      manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
+                                             [](CUdeviceptr *ptr) {
+                                               cuMemFree(*ptr);
+                                               delete ptr;
+                                             });
+    } else {
+      throw Error(CUDA_ERROR_INVALID_VALUE);
+    }
   }
 
   explicit DeviceMemory(CUdeviceptr ptr) : Wrapper(ptr) {}
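For orientation, a minimal usage sketch of the new constructor follows. It is an illustration only, not part of the patch; it assumes the cu::Device/cu::Context setup used by the tests, and bytesize is a placeholder size.

#include <cudawrappers/cu.hpp>

const size_t bytesize = 1024 * sizeof(float);

// Default arguments: ordinary device memory through cuMemAlloc, as before.
cu::DeviceMemory d_device(bytesize);

// Managed (unified) memory through cuMemAllocManaged; the driver requires the
// flags to be CU_MEM_ATTACH_GLOBAL or CU_MEM_ATTACH_HOST.
cu::DeviceMemory d_managed(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);

// Any other type/flags combination throws cu::Error(CUDA_ERROR_INVALID_VALUE).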
From d9cd6fb4a6ee3feb3f6ed8ead74d4e9392664c3e Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Tue, 19 Sep 2023 14:42:10 +0200
Subject: [PATCH 02/13] Update test_vector_add

---
 tests/test_vector_add.cpp | 49 ++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index 1ad71d1c..bdcf0d95 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -15,6 +15,15 @@ void check_arrays_equal(const float *a, const float *b, size_t n) {
   }
 }
 
+void initialize_arrays(float *a, float *b, float *c, float *r, int N) {
+  for (int i = 0; i < N; i++) {
+    a[i] = 1.0 + i;
+    b[i] = 2.0 - (N - i);
+    c[i] = 0.0;
+    r[i] = a[i] + b[i];
+  }
+}
+
 TEST_CASE("Vector add") {
   const std::string kernel = R"(
     extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
@@ -32,28 +41,7 @@ TEST_CASE("Vector add") {
   cu::Device device(0);
   cu::Context context(CU_CTX_SCHED_BLOCKING_SYNC, device);
 
-  cu::HostMemory h_a(bytesize);
-  cu::HostMemory h_b(bytesize);
-  cu::HostMemory h_c(bytesize);
-  std::vector<float> reference_c(N);
-
-  float *a = static_cast<float *>(h_a);
-  float *b = static_cast<float *>(h_b);
-  float *c = static_cast<float *>(h_c);
-  for (int i = 0; i < N; i++) {
-    a[i] = 1.0 + i;
-    b[i] = 2.0 - (N - i);
-    c[i] = 0.0;
-    reference_c[i] = a[i] + b[i];
-  }
-
-  cu::DeviceMemory d_a(bytesize);
-  cu::DeviceMemory d_b(bytesize);
-  cu::DeviceMemory d_c(bytesize);
-
   cu::Stream stream;
-  stream.memcpyHtoDAsync(d_a, a, bytesize);
-  stream.memcpyHtoDAsync(d_b, b, bytesize);
 
   std::vector<std::string> options = {};
   nvrtc::Program program(kernel, "vector_add_kernel.cu");
@@ -68,11 +56,26 @@ TEST_CASE("Vector add") {
   cu::Function function(module, "vector_add");
 
   SECTION("Run kernel") {
+    cu::HostMemory h_a(bytesize);
+    cu::HostMemory h_b(bytesize);
+    cu::HostMemory h_c(bytesize);
+    std::vector<float> reference_c(N);
+
+    initialize_arrays(static_cast<float *>(h_a), static_cast<float *>(h_b),
+                      static_cast<float *>(h_c), reference_c.data(), N);
+
+    cu::DeviceMemory d_a(bytesize);
+    cu::DeviceMemory d_b(bytesize);
+    cu::DeviceMemory d_c(bytesize);
+
+    stream.memcpyHtoDAsync(d_a, h_a, bytesize);
+    stream.memcpyHtoDAsync(d_b, h_b, bytesize);
     std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
                                             d_b.parameter(), &N};
     stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
-    stream.memcpyDtoHAsync(c, d_c, bytesize);
+    stream.memcpyDtoHAsync(h_c, d_c, bytesize);
     stream.synchronize();
-    check_arrays_equal(c, reference_c.data(), N);
+
+    check_arrays_equal(h_c, reference_c.data(), N);
   }
 }

From 78d3a0180338ee0ee901df5fedb1d580dd46b5f9 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Tue, 19 Sep 2023 14:50:12 +0200
Subject: [PATCH 03/13] Add vector_add test with managed memory

---
 tests/test_vector_add.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index bdcf0d95..f526c84a 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -78,4 +78,24 @@ TEST_CASE("Vector add") {
 
     check_arrays_equal(h_c, reference_c.data(), N);
   }
+
+  SECTION("Run kenrel with managed memory") {
+    cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+    cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+    cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
+
+    float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
+    float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
+    float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
+    std::vector<float> reference_c(N);
+
+    initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);
+
+    std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
+                                            d_b.parameter(), &N};
+    stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
+    stream.synchronize();
+
+    check_arrays_equal(h_c, reference_c.data(), N);
+  }
 }

From 24ec11e6ecb758dac1a895eda3ddc3eb5ac74dda Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Tue, 19 Sep 2023 15:47:53 +0200
Subject: [PATCH 04/13] Add Stream::memPrefetchAsync and test

---
 include/cudawrappers/cu.hpp |  4 ++++
 tests/test_vector_add.cpp   | 30 +++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index d22640f9..447ed3a2 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -441,6 +441,10 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
   }
 
+  void memPrefetchAsync(CUdeviceptr devPtr, CUdevice dstDevice, size_t size) {
+    cuMemPrefetchAsync(devPtr, size, dstDevice, _obj);
+  }
+
   void launchKernel(Function &function, unsigned gridX, unsigned gridY,
                     unsigned gridZ, unsigned blockX, unsigned blockY,
                     unsigned blockZ, unsigned sharedMemBytes,
diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index f526c84a..ed4d0995 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -79,7 +79,7 @@ TEST_CASE("Vector add") {
     check_arrays_equal(h_c, reference_c.data(), N);
   }
 
-  SECTION("Run kenrel with managed memory") {
+  SECTION("Run kernel with managed memory") {
     cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
     cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
     cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
@@ -98,4 +98,32 @@ TEST_CASE("Vector add") {
 
     check_arrays_equal(h_c, reference_c.data(), N);
   }
+
+  SECTION("Run kernel with managed memory and prefetch") {
+    if (device.getAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      cu::DeviceMemory d_a(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+      cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+      cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
+                           CU_MEM_ATTACH_GLOBAL);
+
+      float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
+      float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
+      float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
+      std::vector<float> reference_c(N);
+
+      initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);
+
+      std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
+                                              d_b.parameter(), &N};
+      stream.memPrefetchAsync(d_a, device, bytesize);
+      stream.memPrefetchAsync(d_b, device, bytesize);
+      stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
+      stream.memPrefetchAsync(d_c, CU_DEVICE_CPU, bytesize);
+      stream.synchronize();
+
+      check_arrays_equal(h_c, reference_c.data(), N);
+    }
+  }
 }
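A condensed sketch of the prefetch pattern this test exercises (illustration only, not part of the patch; device is an assumed cu::Device, and the argument order shown is the one introduced here, swapped again in PATCH 09). As the attribute check in the test hints, prefetching is only useful on devices that report CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.

const size_t bytesize = 1024 * sizeof(float);
cu::Stream stream;
cu::DeviceMemory d_x(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);
float *h_x = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_x));
// ... fill h_x on the host ...
stream.memPrefetchAsync(d_x, device, bytesize);         // migrate pages to the GPU
// ... launch kernels that read or write d_x ...
stream.memPrefetchAsync(d_x, CU_DEVICE_CPU, bytesize);  // migrate pages back to the host
stream.synchronize();                                   // h_x is now safe to read again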
From e3ce071752d068c53feb8e767ee56abb6a0bf442 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Tue, 19 Sep 2023 15:54:48 +0200
Subject: [PATCH 05/13] Update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 34b53e0c..235e5ad0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ### Added
 - cufft wrappers for 1D and 2D complex-to-complex FFTs
 - cu::HostMemory constuctor for pre-allocated memory
+- cu::DeviceMemory constructor for managed memory
+- cu::Stream::cuMemPrefetchAsync for pre-fetching of managed memory
 
 ### Changed
 - The vector_add example has now become a test

From b630efd6148324e0a9e273d06a41ba40b5cfae62 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Wed, 20 Sep 2023 09:09:08 +0200
Subject: [PATCH 06/13] Add missing checkCudaCall

---
 include/cudawrappers/cu.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 447ed3a2..397598d5 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -442,7 +442,7 @@ class Stream : public Wrapper<CUstream> {
   }
 
   void memPrefetchAsync(CUdeviceptr devPtr, CUdevice dstDevice, size_t size) {
-    cuMemPrefetchAsync(devPtr, size, dstDevice, _obj);
+    checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
   }
 
   void launchKernel(Function &function, unsigned gridX, unsigned gridY,

From 32d06e6695fb53445b853ae23bd2a35728248103 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Wed, 20 Sep 2023 09:09:56 +0200
Subject: [PATCH 07/13] Move manager assignment outside of if-else block

---
 include/cudawrappers/cu.hpp | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 397598d5..463bb1c2 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -247,21 +247,16 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
                         unsigned int flags = 0) {
     if (type == CU_MEMORYTYPE_DEVICE and !flags) {
       checkCudaCall(cuMemAlloc(&_obj, size));
-      manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
-                                             [](CUdeviceptr *ptr) {
-                                               cuMemFree(*ptr);
-                                               delete ptr;
-                                             });
     } else if (type == CU_MEMORYTYPE_UNIFIED) {
       checkCudaCall(cuMemAllocManaged(&_obj, size, flags));
-      manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
-                                             [](CUdeviceptr *ptr) {
-                                               cuMemFree(*ptr);
-                                               delete ptr;
-                                             });
     } else {
       throw Error(CUDA_ERROR_INVALID_VALUE);
     }
+    manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj),
+                                           [](CUdeviceptr *ptr) {
+                                             cuMemFree(*ptr);
+                                             delete ptr;
+                                           });
   }
 
   explicit DeviceMemory(CUdeviceptr ptr) : Wrapper(ptr) {}
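For reference, the checkCudaCall helper these patches lean on (its definition lies outside the hunks shown here) is essentially a CUresult-to-exception translator along the following lines. This is a sketch of the existing cu.hpp convention, not something introduced by this series:

inline void checkCudaCall(CUresult result) {
  if (result != CUDA_SUCCESS) {
    throw Error(result);  // cu::Error carries the CUresult error code
  }
}

That is what PATCH 06 buys: without the wrapper, a failing cuMemPrefetchAsync would be ignored silently instead of surfacing as a cu::Error.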
From 6d18c824c773311d692db5bf4c4d5d58b50536c7 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Wed, 20 Sep 2023 15:26:28 +0200
Subject: [PATCH 08/13] Add tests for invalid arguments

---
 tests/test_vector_add.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index ed4d0995..4d01a205 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -126,4 +126,17 @@ TEST_CASE("Vector add") {
       check_arrays_equal(h_c, reference_c.data(), N);
     }
   }
+
+  SECTION("Pass invalid CUmemorytype to cu::DeviceMemory constructor") {
+    CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_ARRAY));
+    CHECK_THROWS(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_HOST));
+  }
+
+  SECTION("Pass flags with CU_MEMORYTYPE_DEVICE") {
+    CHECK_NOTHROW(cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, 0));
+    CHECK_THROWS(
+        cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_GLOBAL));
+    CHECK_THROWS(
+        cu::DeviceMemory(bytesize, CU_MEMORYTYPE_DEVICE, CU_MEM_ATTACH_HOST));
+  }
 }

From e884449e1d0607ad937aa66a187f8aa07c417013 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Wed, 20 Sep 2023 15:36:43 +0200
Subject: [PATCH 09/13] Swap arguments for Stream::memPrefetchAsync

---
 include/cudawrappers/cu.hpp | 2 +-
 tests/test_vector_add.cpp  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index 463bb1c2..a6528f1f 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -436,7 +436,7 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
   }
 
-  void memPrefetchAsync(CUdeviceptr devPtr, CUdevice dstDevice, size_t size) {
+  void memPrefetchAsync(CUdeviceptr devPtr, size_t size, CUdevice dstDevice) {
     checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
   }
 
diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index 4d01a205..5de62cc9 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -117,10 +117,10 @@ TEST_CASE("Vector add") {
 
       std::vector<const void *> parameters = {d_c.parameter(), d_a.parameter(),
                                               d_b.parameter(), &N};
-      stream.memPrefetchAsync(d_a, device, bytesize);
-      stream.memPrefetchAsync(d_b, device, bytesize);
+      stream.memPrefetchAsync(d_a, bytesize, device);
+      stream.memPrefetchAsync(d_b, bytesize, device);
       stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
-      stream.memPrefetchAsync(d_c, CU_DEVICE_CPU, bytesize);
+      stream.memPrefetchAsync(d_c, bytesize, CU_DEVICE_CPU);
       stream.synchronize();
 
       check_arrays_equal(h_c, reference_c.data(), N);

From 43128db7c203c9cf537f739219960d6c96bae6b2 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Wed, 20 Sep 2023 15:38:11 +0200
Subject: [PATCH 10/13] Make CU_DEVICE_CPU the default option for memPrefetchAsync

---
 include/cudawrappers/cu.hpp | 3 ++-
 tests/test_vector_add.cpp  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index a6528f1f..e53171da 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -436,7 +436,8 @@ class Stream : public Wrapper<CUstream> {
     checkCudaCall(cuMemcpyAsync(dstPtr, srcPtr, size, _obj));
   }
 
-  void memPrefetchAsync(CUdeviceptr devPtr, size_t size, CUdevice dstDevice) {
+  void memPrefetchAsync(CUdeviceptr devPtr, size_t size,
+                        CUdevice dstDevice = CU_DEVICE_CPU) {
     checkCudaCall(cuMemPrefetchAsync(devPtr, size, dstDevice, _obj));
   }
 
diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index 5de62cc9..de42b38d 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -120,7 +120,7 @@ TEST_CASE("Vector add") {
       stream.memPrefetchAsync(d_a, bytesize, device);
       stream.memPrefetchAsync(d_b, bytesize, device);
       stream.launchKernel(function, 1, 1, 1, N, 1, 1, 0, parameters);
-      stream.memPrefetchAsync(d_c, bytesize, CU_DEVICE_CPU);
+      stream.memPrefetchAsync(d_c, bytesize);
      stream.synchronize();
 
       check_arrays_equal(h_c, reference_c.data(), N);
From ac512be4eba688e3a1ca2821eb9649ca9ec13700 Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Thu, 21 Sep 2023 10:58:35 +0200
Subject: [PATCH 11/13] Add operator T *() to DeviceMemory

---
 include/cudawrappers/cu.hpp |  5 +++++
 tests/test_vector_add.cpp  | 12 ++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index e53171da..db5a92c2 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -274,6 +274,11 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
   {
     return &_obj;
   }
+
+  template <typename T>
+  operator T *() {
+    return reinterpret_cast<T *>(_obj);
+  }
 };
 
 class Array : public Wrapper<CUarray> {
diff --git a/tests/test_vector_add.cpp b/tests/test_vector_add.cpp
index de42b38d..1ac90c95 100644
--- a/tests/test_vector_add.cpp
+++ b/tests/test_vector_add.cpp
@@ -84,9 +84,9 @@ TEST_CASE("Vector add") {
     cu::DeviceMemory d_b(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
     cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_HOST);
 
-    float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
-    float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
-    float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
+    float *h_a = d_a;
+    float *h_b = d_b;
+    float *h_c = d_c;
     std::vector<float> reference_c(N);
 
     initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);
@@ -108,9 +108,9 @@ TEST_CASE("Vector add") {
       cu::DeviceMemory d_c(bytesize, CU_MEMORYTYPE_UNIFIED,
                            CU_MEM_ATTACH_GLOBAL);
 
-      float *h_a = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_a));
-      float *h_b = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_b));
-      float *h_c = reinterpret_cast<float *>(static_cast<CUdeviceptr>(d_c));
+      float *h_a = d_a;
+      float *h_b = d_b;
+      float *h_c = d_c;
       std::vector<float> reference_c(N);
 
       initialize_arrays(h_a, h_b, h_c, reference_c.data(), N);

From 7435205795b0a99d628dc0b876efc9cdad92a8bc Mon Sep 17 00:00:00 2001
From: Bram Veenboer
Date: Thu, 21 Sep 2023 12:25:28 +0200
Subject: [PATCH 12/13] Add safeguard to operator

---
 include/cudawrappers/cu.hpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/cudawrappers/cu.hpp b/include/cudawrappers/cu.hpp
index db5a92c2..dbd45117 100644
--- a/include/cudawrappers/cu.hpp
+++ b/include/cudawrappers/cu.hpp
@@ -277,7 +277,15 @@ class DeviceMemory : public Wrapper<CUdeviceptr> {
 
   template <typename T>
   operator T *() {
-    return reinterpret_cast<T *>(_obj);
+    bool data;
+    checkCudaCall(
+        cuPointerGetAttribute(&data, CU_POINTER_ATTRIBUTE_IS_MANAGED, _obj));
+    if (data) {
+      return reinterpret_cast<T *>(_obj);
+    } else {
+      throw std::runtime_error(
+          "Cannot return memory of type CU_MEMORYTYPE_DEVICE as pointer.");
+    }
   }
 };
 
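A short usage sketch of the guarded conversion operator (illustration only, not part of the patch; it mirrors the tests added in the next patch, with bytesize as a placeholder size):

const size_t bytesize = 1024 * sizeof(float);

cu::DeviceMemory d_managed(bytesize, CU_MEMORYTYPE_UNIFIED, CU_MEM_ATTACH_GLOBAL);
float *p = d_managed;  // fine: managed memory is host-accessible
p[0] = 42.f;

cu::DeviceMemory d_device(bytesize);  // plain device memory via cuMemAlloc
try {
  float *q = d_device;  // throws std::runtime_error: not host-accessible
} catch (const std::runtime_error &e) {
  // handle or report the misuse
}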
as host pointer") { + cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_DEVICE, 0); + float* ptr; + CHECK_THROWS(ptr = mem); + } + + SECTION("Test cu::DeviceMemory with CU_MEMORYTYPE_UNIFIED as host pointer") { + cu::DeviceMemory mem(sizeof(float), CU_MEMORYTYPE_UNIFIED, + CU_MEM_ATTACH_GLOBAL); + float* ptr = mem; + CHECK_NOTHROW(ptr[0] = 42.f); + } }