Skip to content

Commit

Permalink
Refactor the GPU shared memory interfaces (#1585)
Browse files Browse the repository at this point in the history
Related issue number
--------------------

Fixes #1582, refactor the CMake script to minimize the dependency set as well.

Signed-off-by: Tao He <[email protected]>
  • Loading branch information
sighingnow authored Sep 29, 2023
1 parent d79f2e1 commit dc44bbd
Show file tree
Hide file tree
Showing 31 changed files with 592 additions and 832 deletions.
89 changes: 56 additions & 33 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ if(POLICY CMP0069)
cmake_policy(SET CMP0069 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
endif()
if(POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
set(CMAKE_POLICY_DEFAULT_CMP0104 OLD)
endif()
if(POLICY CMP0146)
cmake_policy(SET CMP0146 OLD)
set(CMAKE_POLICY_DEFAULT_CMP0146 OLD)
endif()

project(vineyard LANGUAGES C CXX VERSION ${VINEYARD_VERSION})

Expand All @@ -32,10 +40,12 @@ option(VINEYARD_USE_LTO "Using IPO/LTO support for link-time optimization" OFF)
option(USE_LIBUNWIND "Using libunwind to retrieve the stack backtrace when exception occurs" ON)
option(USE_INCLUDE_WHAT_YOU_USE "Simplify the intra-module dependencies with iwyu" OFF)
option(USE_JSON_DIAGNOSTICS "Using json diagnostics to check the validity of metadata" OFF)
option(USE_GPU "Using gpu support" OFF)
option(USE_CUDA "Enabling GPU (CUDA) support" OFF)

option(BUILD_VINEYARD_SERVER "Build vineyard's server" ON)
option(BUILD_VINEYARD_SERVER_REDIS "Enable redis as the metadata backend" OFF)
option(BUILD_VINEYARD_SERVER_ETCD "Enable etcd as the metadata backend" ON)
option(BUILD_VINEYARD_SERVER_SPILLING "Enable spilling functionalities in vineyardd" ON)
option(BUILD_VINEYARD_CLIENT "Build vineyard's client" ON)
option(BUILD_VINEYARD_CLIENT_VERBOSE "Build vineyard's client with the most verbose logs" OFF)
option(BUILD_VINEYARD_PYTHON_BINDINGS "Build vineyard's python bindings" ON)
Expand Down Expand Up @@ -310,7 +320,7 @@ macro(find_boost)
set(Boost_USE_STATIC_LIBS ON CACHE BOOL "Use static version of boost libraries.")
set(Boost_USE_STATIC_RUNTIME ON CACHE BOOL "Use static version of boost runtimes.")
endif()
find_package(Boost COMPONENTS system)
find_package(Boost COMPONENTS system filesystem)
# Make boost::property_tree thread safe.
add_compile_options(-DBOOST_SPIRIT_THREADSAFE)
# Don't depends on the boost::system library.
Expand Down Expand Up @@ -473,11 +483,9 @@ macro(find_pthread)
endmacro()

macro(find_cuda)
#find cuda
enable_language(CUDA)
include(CheckLanguage)
check_language(CUDA)
find_package(CUDA REQUIRED)
# find cuda runtime library
set(CUDA_USE_STATIC_CUDA_RUNTIME ON)
find_package(CUDA REQUIRED)
endmacro(find_cuda)

macro(find_zstd)
Expand Down Expand Up @@ -649,16 +657,23 @@ endif()

# build vineyardd
if(BUILD_VINEYARD_SERVER)
find_apache_arrow()
find_etcd_cpp_apiv3()
find_gflags()
find_mimalloc()
find_openssl_libraries(REQUIRED)

if(BUILD_VINEYARD_SERVER_REDIS)
find_hiredis()
find_redis_plus_plus()
endif()
if(BUILD_VINEYARD_SERVER_ETCD)
find_etcd_cpp_apiv3()
find_openssl_libraries(REQUIRED)
endif()
if(BUILD_VINEYARD_SERVER_SPILLING)
find_apache_arrow()
endif()
if(USE_CUDA)
find_cuda()
endif()

file(GLOB_RECURSE SERVER_SRC_FILES "src/server/*.cc"
"src/common/compression/*.cc"
Expand All @@ -674,32 +689,13 @@ if(BUILD_VINEYARD_SERVER)
target_compile_options(vineyardd PRIVATE -DBUILD_VINEYARDD)
target_link_libraries(vineyardd PRIVATE ${Boost_LIBRARIES}
${CMAKE_DL_LIBS}
${CPPREST_LIB}
${ETCD_CPP_LIBRARIES}
${GFLAGS_LIBRARIES}
${GLOG_LIBRARIES}
${OPENSSL_LIBRARIES}
libzstd_static
mimalloc-static
)
target_include_directories(vineyardd PRIVATE ${ARROW_INCLUDE_DIR}
${GFLAGS_INCLUDE_DIR}
)
if(ARROW_SHARED_LIB)
target_link_libraries(vineyardd PRIVATE ${ARROW_SHARED_LIB})
else()
target_link_libraries(vineyardd PRIVATE ${ARROW_STATIC_LIB})
endif()

if(USE_GPU)
enable_language(CUDA)
find_package(CUDA REQUIRED)
add_definitions(-DENABLE_GPU)
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(vineyardd PUBLIC ${CUDA_LIBRARIES})
endif()
target_include_directories(vineyardd PRIVATE ${GFLAGS_INCLUDE_DIR})

target_include_directories(vineyardd PRIVATE ${ETCD_CPP_INCLUDE_DIR})
if(BUILD_VINEYARD_SERVER_REDIS)
target_compile_options(vineyardd PRIVATE -DBUILD_VINEYARDD_REDIS)
target_include_directories(vineyardd PRIVATE ${HIREDIS_INCLUDE_DIR}
Expand All @@ -709,6 +705,29 @@ if(BUILD_VINEYARD_SERVER)
${REDIS_PLUS_PLUS_LIBRARIES}
)
endif()
if(BUILD_VINEYARD_SERVER_ETCD)
target_compile_options(vineyardd PRIVATE -DBUILD_VINEYARDD_ETCD)
target_include_directories(vineyardd PRIVATE ${ETCD_CPP_INCLUDE_DIR})
target_link_libraries(vineyardd PRIVATE ${CPPREST_LIB}
${ETCD_CPP_LIBRARIES}
${OPENSSL_LIBRARIES}
)
endif()
if(BUILD_VINEYARD_SERVER_SPILLING)
target_compile_options(vineyardd PRIVATE -DBUILD_VINEYARDD_SPILLING)
target_include_directories(vineyardd PRIVATE ${ARROW_INCLUDE_DIR})
if(ARROW_SHARED_LIB)
target_link_libraries(vineyardd PRIVATE ${ARROW_SHARED_LIB})
else()
target_link_libraries(vineyardd PRIVATE ${ARROW_STATIC_LIB})
endif()
endif()

if(USE_CUDA)
target_compile_options(vineyardd PRIVATE -DENABLE_CUDA)
target_include_directories(vineyardd PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(vineyardd PRIVATE ${CUDA_LIBRARIES})
endif()

if(${LIBUNWIND_FOUND})
target_link_libraries(vineyardd PRIVATE ${LIBUNWIND_LIBRARIES})
Expand Down Expand Up @@ -771,9 +790,13 @@ if(BUILD_VINEYARD_CLIENT)
Threads::Threads
)
target_link_libraries(vineyard_client PRIVATE libzstd_static)
if(USE_GPU)
find_package(CUDA REQUIRED)
target_link_libraries(vineyard_client PUBLIC ${CUDA_LIBRARIES})

if(USE_CUDA)
find_cuda()

target_compile_options(vineyard_client PRIVATE -DENABLE_CUDA)
target_include_directories(vineyard_client PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(vineyard_client PRIVATE ${CUDA_LIBRARIES})
endif()

# make sure `vineyard_internal_registry` been built.
Expand Down
200 changes: 43 additions & 157 deletions docs/tutorials/data-processing/gpu-memory-sharing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,8 @@ kinds of implementation (GPU, PPU, etc.) as the abstraction to share GPU memory
between different processes, as well as sharing memory between the host and
device.

The unified memory abstraction provides the following utilities:

- Accessing the memory from GPU or CPU using an unified abstraction: the
:code:`GPUData()` and :code:`CPUData()`;
- Synchronizing the memory between GPU and CPU: the :code:`syncFromCPU()` and
:code:`syncFromGPU()`.
The unified memory abstraction is able to automatically synchronize the memory
between host and devices by leveraging the RAII mechanism of C++.

Example
~~~~~~~
Expand All @@ -45,48 +41,66 @@ Example

ObjectID object_id;
Payload object;
std::shared_ptr<GPUUnifiedAddress> gua = nullptr;
RETURN_ON_ERROR(client.CreateGPUBuffer(data_size(), object_id, object, gua));
std::shared_ptr<MutableBuffer> buffer = nullptr;
RETURN_ON_ERROR(client.CreateGPUBuffer(data_size(), object_id, object, buffer));

CHECK(!buffer->is_cpu());
CHECK(buffer->is_mutable());

- Write data to the GPU buffer:
The result buffer's data :code:`buffer->mutable_data()` is a GPU memory pointer,
which can be directly passed to GPU kernels, e.g.,

.. code:: c++

void* gpu_ptr = nullptr;
RETURN_ON_ERROR(gua->GPUData(&gpu_ptr));
cudaMemcpy(gpu_ptr, data, data_size(), cudaMemcpyHostToDevice);
printKernel<<<1, 1>>>(buffer->data());

or, copy to CPU memory first and then synchronize to GPU memory (like CUDA unified memory):
- Composing the buffer content from host code like Unified Memory:

.. code:: c++

void* cpu_ptr = nullptr;
RETURN_ON_ERROR(gua->CPUData(&cpu_ptr));
memcpy(cpu_ptr, data, data_size());
RETURN_ON_ERROR(gua->syncFromCPU());
{
CUDABufferMirror mirror(*buffer, false);
memcpy(mirror.mutable_data(), "hello world", 12);
}
Here the :code:`mirror`'s :code:`data()` and :code:`mutable_data()` are host memory pointers
allocated using the :code:`cudaHostAlloc()` API. When :code:`CUDABufferMirror` destructing,
the host memory will be copied back to the GPU memory automatically.

The second argument of :code:`CUDABufferMirror` indicates whether the initial memory of the
GPU buffer needs to be copied to the host memory. Defaults to :code:`false`.

- Accessing the GPU buffer from another process:

.. code:: c++

ObjectID object_id = ...;
std::shared_ptr<GPUUnifiedAddress> gua = nullptr;
RETURN_ON_ERROR(client.GetGPUBuffer(object_id, true, gua));
std::shared_ptr<Buffer> buffer = nullptr;
RETURN_ON_ERROR(client.GetGPUBuffer(object_id, true, buffer));
CHECK(!buffer->is_cpu());
CHECK(!buffer->is_mutable());

void* gpu_ptr = nullptr;
RETURN_ON_ERROR(gua->GPUData(&gpu_ptr));
The result buffer's data :code:`buffer->data()` is a GPU memory pointer, which can be directly
passed to GPU kernels, e.g.,

.. code:: c++

printKernel<<<1, 1>>>(buffer->data());

- Accessing the shared GPU buffer from CPU:

.. code:: c++

ObjectID object_id = ...;
std::shared_ptr<GPUUnifiedAddress> gua = nullptr;
RETURN_ON_ERROR(client.GetGPUBuffer(object_id, true, gua));
{
CUDABufferMirror mirror(*buffer, true);
printf("CPU data from GPU is: %s\n",
reinterpret_cast<const char*>(mirror.data()));
}

void* cpu_ptr = nullptr;
RETURN_ON_ERROR(gua->CPUData(&cpu_ptr));
RETURN_ON_ERROR(gua->syncFromGPU());
Using the :code:`CUDABufferMirror` to access the GPU buffer from CPU, the mirror's :code:`data()`
is a host memory pointer allocated using the :code:`cudaHostAlloc()` API. For immutable :code:`Buffer`,
the second argument of :code:`CUDABufferMirror` must be :code:`true`, and the GPU memory will be
copied to the host memory when the mirror is constructed.

- Freeing the shared GPU buffer:

Expand All @@ -95,133 +109,5 @@ Example
ObjectID object_id = ...;
RETURN_ON_ERROR(client.DelData(object_id));

:code:`UnifiedMemory` APIs
~~~~~~~~~~~~~~~~~~~~~~~~~~

The complete :code:`UnifiedMemory` APIs are defined as:

.. code:: c++

class GPUUnifiedAddress {
/**
* @brief get the cpu memory address
*
* @param ptr the return cpu data address
* @return GUAError_t the error type
*/
GUAError_t CPUData(void** ptr);

/**
* @brief get the gpu memory address
*
* @param ptr the return gpu data address
* @return GUAError_t the error type
*/
GUAError_t GPUData(void** ptr);

/**
* @brief sync data from CPU related to this gua
*
* @return GUAError_t the error type
*/
GUAError_t syncFromCPU();
/**
* @brief sync data from GPU related to this gua
*
* @return GUAError_t the error type
*/
GUAError_t syncFromGPU();
/**
* @brief Malloc memory related to this gua if needed.
*
* @param size the memory size to be allocated
* @param ptr the memory address on cpu or GPU
* @param is_GPU allocate on GPU
* @return GUAError_t the error type
*/
GUAError_t ManagedMalloc(size_t size, void** ptr, bool is_GPU = false);
/**
* @brief Free the memory
*
*/
void ManagedFree();
/**
* @brief GUA to json
*
*/
void GUAToJSON();
/**
* @brief Get the Ipc Handle object
*
* @param handle the returned handle
* @return GUAError_t the error type
*/
GUAError_t getIpcHandle(cudaIpcMemHandle_t& handle);
/**
* @brief Set the IpcHandle of this GUA
*
* @param handle
*/
void setIpcHandle(cudaIpcMemHandle_t handle);
/**
* @brief Get the IpcHandle of this GUA as vector
*
* @return std::vector<int64_t>
*/
std::vector<int64_t> getIpcHandleVec();
/**
* @brief Set the IpcHandle vector of this GUA
*
* @param handle_vec
*/
void setIpcHandleVec(std::vector<int64_t> handle_vec);
/**
* @brief Set the GPU Mem Ptr object
*
* @param ptr
*/
void setGPUMemPtr(void* ptr);

/**
* @brief return the GPU memory pointer
*
* @return void* the GPU-side memory address
*/
void* getGPUMemPtr();
/**
* @brief Set the Cpu Mem Ptr object
*
* @param ptr
*/
void setCPUMemPtr(void* ptr);

/**
* @brief Get the Cpu Mem Ptr object
*
* @return void*
*/
void* getCPUMemPtr();

/**
* @brief Get the Size object
*
* @return int64_t
*/
int64_t getSize();
/**
* @brief Set the Size object
*
* @param data_size
*/
void setSize(int64_t data_size);
};
For complete example about GPU memory sharing, please refer to
`gpumalloc_test.cu <https://github.com/v6d-io/v6d/blob/main/test/gpumalloc_test.cu>`_
Loading

0 comments on commit dc44bbd

Please sign in to comment.