Support multi-threads and batch, and support nvJPEG for JPEG-compress…

…ed images (#191) This patch supports/addresses the following issues. - #139 - #123 - #149 Please see #149 for the detailed design. - Use [TaskFlow](https://github.com/taskflow/taskflow) to provide a thread pool and distribute the workload to multiple threads if the `num_workers` parameter of the `read_region()` API is larger than zero. - Implement a buffered loader to load images in batches. - Use nvJPEG to decode JPEG-compressed images if the `device` parameter of the `read_region()` API is `cuda` (its performance still needs improvement). - The file handler is now shared among multiple CuImage objects. - Clean up some code. This implementation is already released as part of the cuCIM [v21.12.01](https://github.com/rapidsai/cucim/wiki/release_notes_v21.12.01) release. The following are added on top of the existing changes: - Use nvjpeg.a from the publicly available package. - Fix a GPU memory leak when using the nvjpeg API (when the `device='cuda'` parameter is used in the `read_region` method). - Get `libculibos.a` from /usr/local/lib[64] if the library cannot be found in the Conda environment (nvjpeg depends on this static library). Authors: - Gigon Bae (https://github.com/gigony) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: #191
rapidsai · Jan 28, 2022 · 292c963 · 292c963
1 parent 8b909e3
commit 292c963
Show file tree

Hide file tree

Showing 63 changed files with 3,314 additions and 647 deletions.
diff --git a/.gitignore b/.gitignore
@@ -153,3 +153,5 @@ conda-bld
 # Large Images
 *.svs
 
+# Custom debug environment setup script for VS Code (used by scripts/debug_python)
+/scripts/debug_env.sh
diff --git a/.idea/cucim.iml b/.idea/cucim.iml
diff --git a/3rdparty/LICENSE.taskflow b/3rdparty/LICENSE.taskflow
@@ -0,0 +1,21 @@
+TASKFLOW MIT LICENSE
+
+Copyright (c) 2018-2021 Dr. Tsung-Wei Huang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -126,6 +126,7 @@ superbuild_depend(json)
 superbuild_depend(libcuckoo)
 superbuild_depend(boost-header-only)
 superbuild_depend(nvtx3)
+superbuild_depend(taskflow)
 
 ################################################################################
 # Define some names

diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md
@@ -85,7 +85,7 @@ DLPack
   - https://github.com/dmlc/dlpack/blob/main/LICENSE
 - Copyright: DLPack Contributors
 
-NVIDIA CUDA TOOLKIT (including libcufile)
+NVIDIA CUDA TOOLKIT (including libcufile/libnvjpeg)
 - License: NVIDIA License
   - https://docs.nvidia.com/cuda/pdf/EULA.pdf
 - Copyright: NVIDIA Corporation
@@ -269,3 +269,9 @@ NVTX (NVIDIA Tool Extension Library)
   - https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/LICENSE.txt
 - Copyright: NVIDIA Corporation
 - Usage: Support for profiling with NVIDIA Nsight Systems
+
+Taskflow
+- License: MIT License
+  - https://github.com/taskflow/taskflow
+- Copyright: Dr. Tsung-Wei Huang
+- Usage: Threadpool implementation for batch processing.
diff --git a/benchmarks/main.cpp b/benchmarks/main.cpp
@@ -65,8 +65,7 @@ static void test_cucim(benchmark::State& state)
 
         cucim::CuImage image = cucim::CuImage(input_path.c_str());
         cucim::CuImage region =
-            image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0,
-                              cucim::DimIndices{}, "cpu", nullptr, "");
+            image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0);
     }
 }
 

diff --git a/conda/recipes/libcucim/build.sh b/conda/recipes/libcucim/build.sh
@@ -5,6 +5,7 @@ CUCIM_BUILD_TYPE=${CUCIM_BUILD_TYPE:-release}
 echo "CC          : ${CC}"
 echo "CXX         : ${CXX}"
 echo "CUDAHOSTCXX : ${CUDAHOSTCXX}"
+echo "CUDA        : ${CUDA}"
 
 # For now CUDAHOSTCXX is set to `/usr/bin/g++` by
 # https://github.com/rapidsai/docker/blob/161b200157206660d88fb02cf69fe58d363ac95e/generated-dockerfiles/rapidsai-core_ubuntu18.04-devel.Dockerfile

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -44,6 +44,7 @@ add_library(${CUCIM_PACKAGE_NAME}
         include/cucim/codec/base64.h
         include/cucim/codec/hash_function.h
         include/cucim/codec/methods.h
+        include/cucim/concurrent/threadpool.h
         include/cucim/config/config.h
         include/cucim/core/framework.h
         include/cucim/core/plugin.h
@@ -58,6 +59,9 @@ add_library(${CUCIM_PACKAGE_NAME}
         include/cucim/io/device.h
         include/cucim/io/device_type.h
         include/cucim/io/format/image_format.h
+        include/cucim/loader/batch_data_processor.h
+        include/cucim/loader/thread_batch_data_loader.h
+        include/cucim/loader/tile_info.h
         include/cucim/logger/logger.h
         include/cucim/logger/timer.h
         include/cucim/macros/defines.h
@@ -85,6 +89,7 @@ add_library(${CUCIM_PACKAGE_NAME}
         src/cache/image_cache_shared_memory.h
         src/cache/image_cache_shared_memory.cpp
         src/codec/base64.cpp
+        src/concurrent/threadpool.cpp
         src/config/config.cpp
         src/core/cucim_framework.h
         src/core/cucim_framework.cpp
@@ -98,6 +103,8 @@ add_library(${CUCIM_PACKAGE_NAME}
         src/io/device.cpp
         src/io/device_type.cpp
         src/io/format/image_format.cpp
+        src/loader/batch_data_processor.cpp
+        src/loader/thread_batch_data_loader.cpp
         src/logger/logger.cpp
         src/logger/timer.cpp
         src/memory/memory_manager.cpp
@@ -144,6 +151,7 @@ target_compile_definitions(${CUCIM_PACKAGE_NAME}
 target_link_libraries(${CUCIM_PACKAGE_NAME}
         PUBLIC
             ${CMAKE_DL_LIBS}
+            Threads::Threads # -lpthread
             $<BUILD_INTERFACE:deps::fmt>
             $<INSTALL_INTERFACE:cucim::fmt-header-only>
         PRIVATE
@@ -154,6 +162,7 @@ target_link_libraries(${CUCIM_PACKAGE_NAME}
             deps::boost-header-only
             deps::json
             deps::nvtx3
+            deps::taskflow
         )
 
 if (CUCIM_STATIC_GDS)

diff --git a/cpp/cmake/deps/taskflow.cmake b/cpp/cmake/deps/taskflow.cmake
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if (NOT TARGET deps::taskflow) # Guard: set up the Taskflow dependency only if deps::taskflow is not defined yet.
+    FetchContent_Declare(
+            deps-taskflow
+            GIT_REPOSITORY https://github.com/taskflow/taskflow.git
+            GIT_TAG v3.2.0 # Pinned to a release tag.
+            GIT_SHALLOW TRUE # Shallow clone: history is not needed for a build dependency.
+    )
+    FetchContent_GetProperties(deps-taskflow) # Loads deps-taskflow_POPULATED / _SOURCE_DIR / _BINARY_DIR if already fetched.
+    if (NOT deps-taskflow_POPULATED) # Download only once per configure tree.
+        message(STATUS "Fetching taskflow sources")
+        FetchContent_Populate(deps-taskflow)
+        message(STATUS "Fetching taskflow sources - done")
+    endif ()
+
+    set(TF_BUILD_TESTS OFF) # NOTE(review): plain (non-cache) variable; whether Taskflow's own option() honors it depends on policy CMP0077 - confirm.
+    set(TF_BUILD_EXAMPLES OFF) # Same caveat as TF_BUILD_TESTS above.
+
+    add_subdirectory(${deps-taskflow_SOURCE_DIR} ${deps-taskflow_BINARY_DIR} EXCLUDE_FROM_ALL) # EXCLUDE_FROM_ALL keeps Taskflow's targets out of the default `all` build.
+
+    add_library(deps::taskflow INTERFACE IMPORTED GLOBAL) # GLOBAL makes the imported target visible outside this directory scope.
+    target_link_libraries(deps::taskflow INTERFACE Taskflow) # Forwards to the `Taskflow` target (presumably defined by the subdirectory added above).
+    set(deps-taskflow_SOURCE_DIR ${deps-taskflow_SOURCE_DIR} CACHE INTERNAL "" FORCE) # Persist the source dir in the cache so other scopes can read it.
+    mark_as_advanced(deps-taskflow_SOURCE_DIR)
+endif ()
diff --git a/cpp/include/cucim/cache/image_cache.h b/cpp/include/cucim/cache/image_cache.h
@@ -43,14 +43,18 @@ struct EXPORT_VISIBLE ImageCacheKey
 
 struct EXPORT_VISIBLE ImageCacheValue
 {
-    ImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr);
+    ImageCacheValue(void* data,
+                    uint64_t size,
+                    void* user_obj = nullptr,
+                    const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU);
     virtual ~ImageCacheValue(){};
 
     operator bool() const;
 
     void* data = nullptr;
     uint64_t size = 0;
     void* user_obj = nullptr;
+    cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU;
 };
 
 /**
@@ -63,11 +67,14 @@ struct EXPORT_VISIBLE ImageCacheValue
 class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this<ImageCache>
 {
 public:
-    ImageCache(const ImageCacheConfig& config, CacheType type = CacheType::kNoCache);
+    ImageCache(const ImageCacheConfig& config,
+               CacheType type = CacheType::kNoCache,
+               const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU);
     virtual ~ImageCache(){};
 
     virtual CacheType type() const;
     virtual const char* type_str() const;
+    virtual cucim::io::DeviceType device_type() const;
     virtual ImageCacheConfig& config();
     virtual ImageCacheConfig get_config() const;
 
@@ -79,14 +86,17 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this<ImageCache
      * @return std::shared_ptr<ImageCacheKey> A shared pointer containing %ImageCacheKey.
      */
     virtual std::shared_ptr<ImageCacheKey> create_key(uint64_t file_hash, uint64_t index) = 0;
-    virtual std::shared_ptr<ImageCacheValue> create_value(void* data, uint64_t size) = 0;
+    virtual std::shared_ptr<ImageCacheValue> create_value(
+        void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) = 0;
 
     virtual void* allocate(std::size_t n) = 0;
 
     virtual void lock(uint64_t index) = 0;
     virtual void unlock(uint64_t index) = 0;
+    virtual void* mutex(uint64_t index) = 0;
 
     virtual bool insert(std::shared_ptr<ImageCacheKey>& key, std::shared_ptr<ImageCacheValue>& value) = 0;
+    virtual void remove_front() = 0;
 
     virtual uint32_t size() const = 0;
     virtual uint64_t memory_size() const = 0;
@@ -128,6 +138,7 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this<ImageCache
 
 protected:
     CacheType type_ = CacheType::kNoCache;
+    cucim::io::DeviceType device_type_ = cucim::io::DeviceType::kCPU;
     ImageCacheConfig config_;
 };
 

diff --git a/cpp/include/cucim/cache/image_cache_manager.h b/cpp/include/cucim/cache/image_cache_manager.h
@@ -20,14 +20,15 @@
 #include "cucim/core/framework.h"
 
 #include "cucim/cache/image_cache.h"
+#include "cucim/io/device_type.h"
 
 namespace cucim::cache
 {
 
 constexpr uint32_t kDefaultTileSize = 256;
 constexpr uint32_t kDefaultPatchSize = 256;
 
-uint32_t EXPORT_VISIBLE preferred_memory_capacity(const std::vector<uint32_t>& image_size,
+uint32_t EXPORT_VISIBLE preferred_memory_capacity(const std::vector<uint64_t>& image_size,
                                                   const std::vector<uint32_t>& tile_size,
                                                   const std::vector<uint32_t>& patch_size,
                                                   uint32_t bytes_per_pixel = 3);
@@ -43,9 +44,11 @@ class EXPORT_VISIBLE ImageCacheManager
     void reserve(uint32_t new_memory_capacity);
     void reserve(uint32_t new_memory_capacity, uint32_t new_capacity);
 
+    static std::unique_ptr<ImageCache> create_cache(const ImageCacheConfig& cache_config,
+                                                    const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU);
+
 private:
     std::unique_ptr<ImageCache> create_cache() const;
-    std::unique_ptr<ImageCache> create_cache(const ImageCacheConfig& cache_config) const;
 
     std::shared_ptr<ImageCache> cache_;
 };

diff --git a/cpp/include/cucim/concurrent/threadpool.h b/cpp/include/cucim/concurrent/threadpool.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUCIM_CONCURRENT_THREADPOOL_H
+#define CUCIM_CONCURRENT_THREADPOOL_H
+
+#include "cucim/macros/api_header.h"
+
+#include <functional>
+#include <future>
+#include <memory>
+
+namespace cucim::concurrent
+{
+
+class EXPORT_VISIBLE ThreadPool // Exported, non-copyable pool that accepts void() tasks; implementation is hidden behind Executor.
+{
+public:
+    explicit ThreadPool(int32_t num_workers); // NOTE(review): int32_t is used but <cstdint> is not included directly by this header - confirm api_header.h provides it.
+    ThreadPool(const ThreadPool&) = delete; // Not copy-constructible.
+
+    ThreadPool& operator=(const ThreadPool&) = delete; // Not copy-assignable.
+
+    operator bool() const; // Implicit (non-explicit) bool conversion; presumably true when the pool has workers - TODO confirm in threadpool.cpp.
+
+    ~ThreadPool(); // Declared here and defined out of line because Executor is an incomplete type in this header.
+
+    std::future<void> enqueue(std::function<void()> task); // Submits a task (taken by value) and returns a future for it; presumably ready once the task has run - confirm.
+    void wait(); // Presumably blocks until submitted tasks have finished - TODO confirm against the implementation.
+
+private:
+    struct Executor; // Forward declaration only; defined outside this header.
+    std::unique_ptr<Executor> executor_; // Owning pointer to the hidden implementation.
+    size_t num_workers_; // No default initializer here; note the constructor parameter is a signed int32_t.
+};
+
+} // namespace cucim::concurrent
+
+#endif // CUCIM_CONCURRENT_THREADPOOL_H