From 292c963d7449e9fff86c4a2fa68f5788c15cb259 Mon Sep 17 00:00:00 2001
From: Gigon Bae
Date: Thu, 27 Jan 2022 23:55:36 -0800
Subject: [PATCH] Support multi-threads and batch, and support nvJPEG for JPEG-compressed images (#191)

This patch addresses the following issues:
- #139
- #123
- #149

Please see #149 for the detailed design.

- Use [Taskflow](https://github.com/taskflow/taskflow) to maintain a thread pool and distribute the workload across multiple threads when the `num_workers` parameter of the `read_region()` API is larger than zero.
- Implement a buffered loader to load images in batches.
- Use nvJPEG to decode JPEG-compressed images when the `device` parameter of the `read_region()` API is `cuda` (though its performance still needs improvement).
- Share the file handle among multiple CuImage objects.
- Clean up some code.

This implementation was already released as part of the cuCIM [v21.12.01](https://github.com/rapidsai/cucim/wiki/release_notes_v21.12.01) release. The following changes are added on top of the existing ones:

- Use `nvjpeg.a` from the publicly available package.
- Fix a GPU memory leak in the nvJPEG code path (when the `device='cuda'` parameter is used in the `read_region` method).
- Get `libculibos.a` from `/usr/local/lib[64]` if the library cannot be found in the Conda environment (nvJPEG depends on that static library).

Authors:
- Gigon Bae (https://github.com/gigony)

Approvers:
- AJ Schmidt (https://github.com/ajschmidt8)
- https://github.com/jakirkham

URL: https://github.com/rapidsai/cucim/pull/191
---
 .gitignore | 2 +
 .idea/cucim.iml | 10 +-
 3rdparty/LICENSE.taskflow | 21 +
 CMakeLists.txt | 1 +
 LICENSE-3rdparty.md | 8 +-
 benchmarks/main.cpp | 3 +-
 conda/recipes/libcucim/build.sh | 1 +
 cpp/CMakeLists.txt | 9 +
 cpp/cmake/deps/taskflow.cmake | 39 +
 cpp/include/cucim/cache/image_cache.h | 17 +-
 cpp/include/cucim/cache/image_cache_manager.h | 7 +-
 cpp/include/cucim/concurrent/threadpool.h | 52 ++
 cpp/include/cucim/cuimage.h | 73 +-
 cpp/include/cucim/filesystem/cufile_driver.h | 2 +-
 cpp/include/cucim/filesystem/file_handle.h | 73 +-
 cpp/include/cucim/io/format/image_format.h | 37 +-
 .../cucim/loader/batch_data_processor.h | 59 ++
 .../cucim/loader/thread_batch_data_loader.h | 114 +++
 cpp/include/cucim/loader/tile_info.h | 37 +
 cpp/include/cucim/profiler/nvtx3.h | 2 +
 cpp/include/cucim/util/cuda.h | 37 +-
 cpp/plugins/cucim.kit.cumed/CMakeLists.txt | 2 +
 .../cucim.kit.cumed/src/cumed/cumed.cpp | 25 +-
 .../.idea/cucim.kit.cuslide.iml | 2 +-
 cpp/plugins/cucim.kit.cuslide/CMakeLists.txt | 12 +
 .../cmake/deps/libculibos.cmake | 42 +
 .../cmake/deps/libopenjpeg.cmake | 1 -
 .../cucim.kit.cuslide/cmake/deps/nvjpeg.cmake | 41 +
 .../cucim.kit.cuslide/src/cuslide/cuslide.cpp | 21 +-
 .../src/cuslide/jpeg/libnvjpeg.cpp | 57 ++
 .../src/cuslide/jpeg/libnvjpeg.h | 36 +
 .../src/cuslide/loader/nvjpeg_processor.cpp | 439 ++++++++++
 .../src/cuslide/loader/nvjpeg_processor.h | 110 +++
 .../src/cuslide/tiff/ifd.cpp | 801 ++++++++++++------
 .../cucim.kit.cuslide/src/cuslide/tiff/ifd.h | 25 +-
 .../src/cuslide/tiff/tiff.cpp | 36 +-
 .../cucim.kit.cuslide/src/cuslide/tiff/tiff.h | 7 +-
 cpp/src/cache/image_cache.cpp | 11 +-
 cpp/src/cache/image_cache_empty.cpp | 10 +-
 cpp/src/cache/image_cache_empty.h | 5 +-
 cpp/src/cache/image_cache_manager.cpp | 27 +-
 cpp/src/cache/image_cache_per_process.cpp | 117 ++-
 cpp/src/cache/image_cache_per_process.h | 14 +-
 cpp/src/cache/image_cache_shared_memory.cpp | 88 +-
 cpp/src/cache/image_cache_shared_memory.h | 14 +-
 cpp/src/concurrent/threadpool.cpp | 70 ++
 cpp/src/cuimage.cpp | 424 +++++++--
cpp/src/filesystem/cufile_driver.cpp | 100 ++- cpp/src/loader/batch_data_processor.cpp | 77 ++ cpp/src/loader/thread_batch_data_loader.cpp | 320 +++++++ cpp/tests/CMakeLists.txt | 9 + cucim.code-workspace | 32 +- examples/cpp/tiff_image/main.cpp | 19 +- python/cucim/src/localtest.py | 2 +- .../clara/test_read_region_memory_usage.py | 11 +- python/pybind11/cache/cache_py.cpp | 4 +- python/pybind11/cache/cache_py.h | 2 +- python/pybind11/cache/cache_pydoc.h | 2 +- python/pybind11/cucim_py.cpp | 216 ++++- python/pybind11/cucim_py.h | 12 +- python/pybind11/cucim_pydoc.h | 10 + run | 72 ++ scripts/debug_python | 32 + 63 files changed, 3314 insertions(+), 647 deletions(-) create mode 100644 3rdparty/LICENSE.taskflow create mode 100644 cpp/cmake/deps/taskflow.cmake create mode 100644 cpp/include/cucim/concurrent/threadpool.h create mode 100644 cpp/include/cucim/loader/batch_data_processor.h create mode 100644 cpp/include/cucim/loader/thread_batch_data_loader.h create mode 100644 cpp/include/cucim/loader/tile_info.h create mode 100644 cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake create mode 100644 cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h create mode 100644 cpp/src/concurrent/threadpool.cpp create mode 100644 cpp/src/loader/batch_data_processor.cpp create mode 100644 cpp/src/loader/thread_batch_data_loader.cpp create mode 100755 scripts/debug_python diff --git a/.gitignore b/.gitignore index ef3f2babb..40bbb6354 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ conda-bld # Large Images *.svs +# Custom debug environment setup script for VS Code (used by scripts/debug_python) +/scripts/debug_env.sh diff --git a/.idea/cucim.iml b/.idea/cucim.iml index 24b221981..6d70257c7 100644 --- a/.idea/cucim.iml +++ b/.idea/cucim.iml @@ -1,8 +1,2 @@ - - - - - - - - \ No newline at end of file + + \ No newline at end of file diff --git a/3rdparty/LICENSE.taskflow b/3rdparty/LICENSE.taskflow new file mode 100644 index 000000000..33887bc43 --- /dev/null +++ b/3rdparty/LICENSE.taskflow @@ -0,0 +1,21 @@ +TASKFLOW MIT LICENSE + +Copyright (c) 2018-2021 Dr. Tsung-Wei Huang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 585314c27..3c5c6e91e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ superbuild_depend(json) superbuild_depend(libcuckoo) superbuild_depend(boost-header-only) superbuild_depend(nvtx3) +superbuild_depend(taskflow) ################################################################################ # Define some names diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md index 680306f85..05c3f8cc7 100644 --- a/LICENSE-3rdparty.md +++ b/LICENSE-3rdparty.md @@ -85,7 +85,7 @@ DLPack - https://github.com/dmlc/dlpack/blob/main/LICENSE - Copyright: DLPack Contributors -NVIDIA CUDA TOOLKIT (including libcufile) +NVIDIA CUDA TOOLKIT (including libcufile/libnvjpeg) - License: NVIDIA License - https://docs.nvidia.com/cuda/pdf/EULA.pdf - Copyright: NVIDIA Corporation @@ -269,3 +269,9 @@ NVTX (NVIDIA Tool Extension Library) - https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/LICENSE.txt - Copyright: NVIDIA Corporation - Usage: Support for profiling with NVIDIA Nsight Systems + +Taskflow +- License: MIT License + - https://github.com/taskflow/taskflow +- Copyright: Dr. Tsung-Wei Huang +- Usage: Threadpool implementation for batch processing. diff --git a/benchmarks/main.cpp b/benchmarks/main.cpp index d33516bd6..7c2577713 100644 --- a/benchmarks/main.cpp +++ b/benchmarks/main.cpp @@ -65,8 +65,7 @@ static void test_cucim(benchmark::State& state) cucim::CuImage image = cucim::CuImage(input_path.c_str()); cucim::CuImage region = - image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0, - cucim::DimIndices{}, "cpu", nullptr, ""); + image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0); } } diff --git a/conda/recipes/libcucim/build.sh b/conda/recipes/libcucim/build.sh index 364bb9533..9d2217bbc 100644 --- a/conda/recipes/libcucim/build.sh +++ b/conda/recipes/libcucim/build.sh @@ -5,6 +5,7 @@ CUCIM_BUILD_TYPE=${CUCIM_BUILD_TYPE:-release} echo "CC : ${CC}" echo "CXX : ${CXX}" echo "CUDAHOSTCXX : ${CUDAHOSTCXX}" +echo "CUDA : ${CUDA}" # For now CUDAHOSTCXX is set to `/usr/bin/g++` by # https://github.com/rapidsai/docker/blob/161b200157206660d88fb02cf69fe58d363ac95e/generated-dockerfiles/rapidsai-core_ubuntu18.04-devel.Dockerfile diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f6670c982..67409a699 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,6 +44,7 @@ add_library(${CUCIM_PACKAGE_NAME} include/cucim/codec/base64.h include/cucim/codec/hash_function.h include/cucim/codec/methods.h + include/cucim/concurrent/threadpool.h include/cucim/config/config.h include/cucim/core/framework.h include/cucim/core/plugin.h @@ -58,6 +59,9 @@ add_library(${CUCIM_PACKAGE_NAME} include/cucim/io/device.h include/cucim/io/device_type.h include/cucim/io/format/image_format.h + include/cucim/loader/batch_data_processor.h + include/cucim/loader/thread_batch_data_loader.h + include/cucim/loader/tile_info.h include/cucim/logger/logger.h include/cucim/logger/timer.h include/cucim/macros/defines.h @@ -85,6 +89,7 @@ add_library(${CUCIM_PACKAGE_NAME} src/cache/image_cache_shared_memory.h src/cache/image_cache_shared_memory.cpp src/codec/base64.cpp + src/concurrent/threadpool.cpp src/config/config.cpp src/core/cucim_framework.h src/core/cucim_framework.cpp @@ -98,6 +103,8 @@ add_library(${CUCIM_PACKAGE_NAME} src/io/device.cpp src/io/device_type.cpp src/io/format/image_format.cpp + src/loader/batch_data_processor.cpp + 
src/loader/thread_batch_data_loader.cpp src/logger/logger.cpp src/logger/timer.cpp src/memory/memory_manager.cpp @@ -144,6 +151,7 @@ target_compile_definitions(${CUCIM_PACKAGE_NAME} target_link_libraries(${CUCIM_PACKAGE_NAME} PUBLIC ${CMAKE_DL_LIBS} + Threads::Threads # -lpthread $ $ PRIVATE @@ -154,6 +162,7 @@ target_link_libraries(${CUCIM_PACKAGE_NAME} deps::boost-header-only deps::json deps::nvtx3 + deps::taskflow ) if (CUCIM_STATIC_GDS) diff --git a/cpp/cmake/deps/taskflow.cmake b/cpp/cmake/deps/taskflow.cmake new file mode 100644 index 000000000..147b6f4c3 --- /dev/null +++ b/cpp/cmake/deps/taskflow.cmake @@ -0,0 +1,39 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if (NOT TARGET deps::taskflow) + FetchContent_Declare( + deps-taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG v3.2.0 + GIT_SHALLOW TRUE + ) + FetchContent_GetProperties(deps-taskflow) + if (NOT deps-taskflow_POPULATED) + message(STATUS "Fetching taskflow sources") + FetchContent_Populate(deps-taskflow) + message(STATUS "Fetching taskflow sources - done") + endif () + + set(TF_BUILD_TESTS OFF) + set(TF_BUILD_EXAMPLES OFF) + + add_subdirectory(${deps-taskflow_SOURCE_DIR} ${deps-taskflow_BINARY_DIR} EXCLUDE_FROM_ALL) + + add_library(deps::taskflow INTERFACE IMPORTED GLOBAL) + target_link_libraries(deps::taskflow INTERFACE Taskflow) + set(deps-taskflow_SOURCE_DIR ${deps-taskflow_SOURCE_DIR} CACHE INTERNAL "" FORCE) + mark_as_advanced(deps-taskflow_SOURCE_DIR) +endif () diff --git a/cpp/include/cucim/cache/image_cache.h b/cpp/include/cucim/cache/image_cache.h index f8824be7b..d389bc10a 100644 --- a/cpp/include/cucim/cache/image_cache.h +++ b/cpp/include/cucim/cache/image_cache.h @@ -43,7 +43,10 @@ struct EXPORT_VISIBLE ImageCacheKey struct EXPORT_VISIBLE ImageCacheValue { - ImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + ImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); virtual ~ImageCacheValue(){}; operator bool() const; @@ -51,6 +54,7 @@ struct EXPORT_VISIBLE ImageCacheValue void* data = nullptr; uint64_t size = 0; void* user_obj = nullptr; + cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU; }; /** @@ -63,11 +67,14 @@ struct EXPORT_VISIBLE ImageCacheValue class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this { public: - ImageCache(const ImageCacheConfig& config, CacheType type = CacheType::kNoCache); + ImageCache(const ImageCacheConfig& config, + CacheType type = CacheType::kNoCache, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); virtual ~ImageCache(){}; virtual CacheType type() const; virtual const char* type_str() const; + virtual cucim::io::DeviceType device_type() const; virtual ImageCacheConfig& config(); virtual ImageCacheConfig get_config() const; @@ -79,14 +86,17 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this A shared pointer containing 
%ImageCacheKey. */ virtual std::shared_ptr create_key(uint64_t file_hash, uint64_t index) = 0; - virtual std::shared_ptr create_value(void* data, uint64_t size) = 0; + virtual std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) = 0; virtual void* allocate(std::size_t n) = 0; virtual void lock(uint64_t index) = 0; virtual void unlock(uint64_t index) = 0; + virtual void* mutex(uint64_t index) = 0; virtual bool insert(std::shared_ptr& key, std::shared_ptr& value) = 0; + virtual void remove_front() = 0; virtual uint32_t size() const = 0; virtual uint64_t memory_size() const = 0; @@ -128,6 +138,7 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this& image_size, +uint32_t EXPORT_VISIBLE preferred_memory_capacity(const std::vector& image_size, const std::vector& tile_size, const std::vector& patch_size, uint32_t bytes_per_pixel = 3); @@ -43,9 +44,11 @@ class EXPORT_VISIBLE ImageCacheManager void reserve(uint32_t new_memory_capacity); void reserve(uint32_t new_memory_capacity, uint32_t new_capacity); + static std::unique_ptr create_cache(const ImageCacheConfig& cache_config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); + private: std::unique_ptr create_cache() const; - std::unique_ptr create_cache(const ImageCacheConfig& cache_config) const; std::shared_ptr cache_; }; diff --git a/cpp/include/cucim/concurrent/threadpool.h b/cpp/include/cucim/concurrent/threadpool.h new file mode 100644 index 000000000..33e0a18ca --- /dev/null +++ b/cpp/include/cucim/concurrent/threadpool.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUCIM_CONCURRENT_THREADPOOL_H +#define CUCIM_CONCURRENT_THREADPOOL_H + +#include "cucim/macros/api_header.h" + +#include +#include +#include + +namespace cucim::concurrent +{ + +class EXPORT_VISIBLE ThreadPool +{ +public: + explicit ThreadPool(int32_t num_workers); + ThreadPool(const ThreadPool&) = delete; + + ThreadPool& operator=(const ThreadPool&) = delete; + + operator bool() const; + + ~ThreadPool(); + + std::future enqueue(std::function task); + void wait(); + +private: + struct Executor; + std::unique_ptr executor_; + size_t num_workers_; +}; + +} // namespace cucim::concurrent + +#endif // CUCIM_CONCURRENT_THREADPOOL_H diff --git a/cpp/include/cucim/cuimage.h b/cpp/include/cucim/cuimage.h index a7a9b22d9..28a9120b7 100644 --- a/cpp/include/cucim/cuimage.h +++ b/cpp/include/cucim/cuimage.h @@ -23,20 +23,28 @@ #include "cucim/filesystem/file_path.h" #include "cucim/io/device.h" #include "cucim/io/format/image_format.h" +#include "cucim/loader/thread_batch_data_loader.h" #include "cucim/memory/dlpack.h" #include "cucim/plugin/image_format.h" #include "cucim/profiler/profiler.h" #include -#include -#include +#include // for std::ptrdiff_t +#include // for std::forward_iterator_tag #include #include +#include +#include #include namespace cucim { +// Forward declarations +class CuImage; +template +class CuImageIterator; + using DetectedFormat = std::pair>; using Metadata = std::string; using Shape = std::vector; @@ -154,11 +162,19 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this ResolutionInfo resolutions() const; + loader::ThreadBatchDataLoader* loader() const; + memory::DLTContainer container() const; CuImage read_region(std::vector&& location, std::vector&& size, uint16_t level = 0, + uint32_t num_workers = 0, + uint32_t batch_size = 1, + bool drop_last = false, + uint32_t prefetch_factor = 2, + bool shuffle = false, + uint64_t seed = 0, const DimIndices& region_dim_indices = {}, const io::Device& device = "cpu", DLTensor* buf = nullptr, @@ -171,6 +187,22 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this void close(); + ///////////////////////////// + // Iterator implementation // + ///////////////////////////// + + using iterator = CuImageIterator; + using const_iterator = CuImageIterator; + + friend class CuImageIterator; + friend class CuImageIterator; + + iterator begin(); + iterator end(); + + const_iterator begin() const; + const_iterator end() const; + private: using Mutex = std::mutex; using ScopedLock = std::scoped_lock; @@ -191,7 +223,7 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this mutable Mutex mutex_; cucim::io::format::ImageFormatDesc* image_format_ = nullptr; - CuCIMFileHandle file_handle_{}; + std::shared_ptr file_handle_; io::format::ImageMetadataDesc* image_metadata_ = nullptr; io::format::ImageDataDesc* image_data_ = nullptr; bool is_loaded_ = false; @@ -199,6 +231,41 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this std::set associated_images_; }; +template +class EXPORT_VISIBLE CuImageIterator +{ +public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = DataType; + using pointer = value_type*; + using reference = std::shared_ptr; + + CuImageIterator(std::shared_ptr cuimg, bool ending = false); + CuImageIterator(const CuImageIterator& it) = default; + + reference operator*() const; + pointer operator->(); + CuImageIterator& operator++(); + CuImageIterator operator++(int); + bool operator==(const 
CuImageIterator& other); + bool operator!=(const CuImageIterator& other); + + int64_t index(); /// batch index + uint64_t size() const; /// number of batches + +private: + void increase_index_(); + + std::shared_ptr cuimg_; + void* loader_ = nullptr; + int64_t batch_index_ = 0; + uint64_t total_batch_count_ = 0; +}; + +template class CuImageIterator; +template class CuImageIterator; + } // namespace cucim #endif // CUCIM_CUIMAGE_H diff --git a/cpp/include/cucim/filesystem/cufile_driver.h b/cpp/include/cucim/filesystem/cufile_driver.h index f2ec749cd..e1877d5aa 100644 --- a/cpp/include/cucim/filesystem/cufile_driver.h +++ b/cpp/include/cucim/filesystem/cufile_driver.h @@ -190,7 +190,7 @@ class EXPORT_VISIBLE CuFileDriver : public std::enable_shared_from_this handle_; }; } // namespace cucim::filesystem diff --git a/cpp/include/cucim/filesystem/file_handle.h b/cpp/include/cucim/filesystem/file_handle.h index 3bf2c005b..5ecbd56fa 100644 --- a/cpp/include/cucim/filesystem/file_handle.h +++ b/cpp/include/cucim/filesystem/file_handle.h @@ -21,8 +21,16 @@ #include "../macros/defines.h" #include #include +#include + +#include + +#include "cucim/memory/memory_manager.h" typedef void* CUfileHandle_t; +typedef void* CuCIMFileHandle_share; +typedef void* CuCIMFileHandle_ptr; +typedef bool (*CuCIMFileHandleDeleter)(CuCIMFileHandle_ptr); enum class FileHandleType: uint16_t { @@ -35,29 +43,50 @@ enum class FileHandleType: uint16_t #if CUCIM_PLATFORM_LINUX -struct CuCIMFileHandle + +struct EXPORT_VISIBLE CuCIMFileHandle : public std::enable_shared_from_this { -# ifdef __cplusplus - EXPORT_VISIBLE CuCIMFileHandle(); - EXPORT_VISIBLE CuCIMFileHandle(int fd, CUfileHandle_t cufile, FileHandleType type, char* path, void* client_data); - EXPORT_VISIBLE CuCIMFileHandle(int fd, - CUfileHandle_t cufile, - FileHandleType type, - char* path, - void* client_data, - uint64_t dev, - uint64_t ino, - int64_t mtime); -# endif - int fd; - CUfileHandle_t cufile; - FileHandleType type; /// 1: POSIX, 2: POSIX+ODIRECT, 4: MemoryMapped, 8: GPUDirect - char* path; - void* client_data; - uint64_t hash_value; - uint64_t dev; - uint64_t ino; - int64_t mtime; + CuCIMFileHandle(); + CuCIMFileHandle(int fd, CUfileHandle_t cufile, FileHandleType type, char* path, void* client_data); + CuCIMFileHandle(int fd, + CUfileHandle_t cufile, + FileHandleType type, + char* path, + void* client_data, + uint64_t dev, + uint64_t ino, + int64_t mtime); + + ~CuCIMFileHandle() + { + if (path && path[0] != '\0') + { + cucim_free(path); + path = nullptr; + } + + if (deleter) + { + deleter(this); + deleter = nullptr; + } + } + + CuCIMFileHandleDeleter set_deleter(CuCIMFileHandleDeleter deleter) + { + return this->deleter = deleter; + } + + int fd = -1; + CUfileHandle_t cufile = nullptr; + FileHandleType type = FileHandleType::kUnknown; /// 1: POSIX, 2: POSIX+ODIRECT, 4: MemoryMapped, 8: GPUDirect + char* path = nullptr; + void* client_data = nullptr; + uint64_t hash_value = 0; + uint64_t dev = 0; + uint64_t ino = 0; + int64_t mtime = 0; + CuCIMFileHandleDeleter deleter = nullptr; }; #else # error "This platform is not supported!" 
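For illustration, the following is a minimal C++ usage sketch (not part of this patch) showing how the new batching parameters of `read_region()` and the `CuImageIterator` declared above fit together. The file name and coordinates are placeholders, and the result is held in a `std::shared_ptr` because `CuImage` derives from `std::enable_shared_from_this` and `begin()`/`end()` iterate over shared ownership of the region:

    #include "cucim/cuimage.h"

    #include <memory>

    int main()
    {
        cucim::CuImage image("image.tif"); // placeholder input file

        // 'location' is a flattened list of (x, y) pairs, so this requests three
        // patches. num_workers=4 decodes tiles with a 4-thread pool, and
        // batch_size=2 groups the patches into batches of 2 and 1 (drop_last,
        // prefetch_factor, shuffle, and seed keep their defaults).
        auto region = std::make_shared<cucim::CuImage>(image.read_region(
            { 0, 0, 256, 256, 512, 512 }, // patch locations
            { 224, 224 }, // patch size
            0, // level
            4, // num_workers
            2)); // batch_size

        // Iterate over the decoded batches.
        for (auto it = region->begin(); it != region->end(); ++it)
        {
            std::shared_ptr<cucim::CuImage> batch = *it; // one batch per step
            // ... consume batch->container() (a DLPack tensor) ...
        }
        return 0;
    }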
diff --git a/cpp/include/cucim/io/format/image_format.h b/cpp/include/cucim/io/format/image_format.h index 091ea098b..c83f72dc9 100644 --- a/cpp/include/cucim/io/format/image_format.h +++ b/cpp/include/cucim/io/format/image_format.h @@ -160,6 +160,7 @@ struct ImageDataDesc { DLTensor container; char* shm_name; + void* loader; }; struct ImageCheckerDesc @@ -183,7 +184,7 @@ struct ImageParserDesc * @param file_path * @return */ - CuCIMFileHandle(CUCIM_ABI* open)(const char* file_path); + CuCIMFileHandle_share(CUCIM_ABI* open)(const char* file_path); /** * @@ -191,26 +192,36 @@ struct ImageParserDesc * @param out_metadata * @return */ - bool(CUCIM_ABI* parse)(CuCIMFileHandle* handle, ImageMetadataDesc* out_metadata); + bool(CUCIM_ABI* parse)(CuCIMFileHandle_ptr handle, ImageMetadataDesc* out_metadata); /** * * @param handle * @return */ - bool(CUCIM_ABI* close)(CuCIMFileHandle* handle); + bool(CUCIM_ABI* close)(CuCIMFileHandle_ptr handle); }; struct ImageReaderRegionRequestDesc { - int64_t* location; - int64_t* size; - uint16_t level; - DimIndicesDesc region_dim_indices; - char* associated_image_name; - char* device; - DLTensor* buf; - char* shm_name; + int64_t* location = nullptr; + void* location_unique = nullptr; + int64_t* size = nullptr; + void* size_unique = nullptr; + uint64_t location_len = 1; + int32_t size_ndim = 2; + uint16_t level = 0; + uint32_t num_workers = 0; + uint32_t batch_size = 1; + bool drop_last = false; + uint32_t prefetch_factor = 2; + bool shuffle = false; + uint64_t seed = 0; + DimIndicesDesc region_dim_indices{}; + char* associated_image_name = nullptr; + char* device = nullptr; + DLTensor* buf = nullptr; + char* shm_name = nullptr; }; struct ImageReaderDesc @@ -223,7 +234,7 @@ struct ImageReaderDesc * @param out_image_metadata needed for associated_image * @return */ - bool(CUCIM_ABI* read)(const CuCIMFileHandle* handle, + bool(CUCIM_ABI* read)(const CuCIMFileHandle_ptr handle, const ImageMetadataDesc* metadata, const ImageReaderRegionRequestDesc* request, ImageDataDesc* out_image_data, @@ -239,7 +250,7 @@ struct ImageWriterDesc * @param image_data * @return */ - bool(CUCIM_ABI* write)(const CuCIMFileHandle* handle, + bool(CUCIM_ABI* write)(const CuCIMFileHandle_ptr handle, const ImageMetadataDesc* metadata, const ImageDataDesc* image_data); }; diff --git a/cpp/include/cucim/loader/batch_data_processor.h b/cpp/include/cucim/loader/batch_data_processor.h new file mode 100644 index 000000000..7570c772a --- /dev/null +++ b/cpp/include/cucim/loader/batch_data_processor.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUCIM_LOADER_BATCH_DATA_PROCESSOR_H +#define CUCIM_LOADER_BATCH_DATA_PROCESSOR_H + +#include "cucim/macros/api_header.h" + +#include +#include + +#include + +#include "tile_info.h" + +namespace cucim::loader +{ + +class EXPORT_VISIBLE BatchDataProcessor +{ +public: + BatchDataProcessor(uint32_t batch_size); + virtual ~BatchDataProcessor(); + + void add_tile(const TileInfo& tile); + TileInfo remove_front_tile(); + + virtual uint32_t request(std::deque& batch_item_counts, const uint32_t num_remaining_patches); + virtual uint32_t wait_batch(const uint32_t index_in_task, + std::deque& batch_item_counts, + const uint32_t num_remaining_patches); + + virtual std::shared_ptr wait_for_processing(uint32_t); + + virtual void shutdown(); + +protected: + uint32_t batch_size_ = 1; + uint64_t total_index_count_ = 0; + uint64_t processed_index_count_ = 0; + std::deque tiles_; +}; + +} // namespace cucim::loader + +#endif // CUCIM_LOADER_BATCH_DATA_PROCESSOR_H diff --git a/cpp/include/cucim/loader/thread_batch_data_loader.h b/cpp/include/cucim/loader/thread_batch_data_loader.h new file mode 100644 index 000000000..c368184c9 --- /dev/null +++ b/cpp/include/cucim/loader/thread_batch_data_loader.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H +#define CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H + +#include "cucim/macros/api_header.h" + +#include +#include +#include +#include + +#include "cucim/cache/image_cache.h" +#include "cucim/concurrent/threadpool.h" +#include "cucim/io/device.h" +#include "cucim/loader/batch_data_processor.h" +#include "cucim/loader/tile_info.h" + +namespace cucim::loader +{ + +class EXPORT_VISIBLE ThreadBatchDataLoader +{ +public: + using LoadFunc = std::function; + + ThreadBatchDataLoader(LoadFunc load_func, + std::unique_ptr batch_data_processor, + cucim::io::Device out_device, + std::unique_ptr> location, + std::unique_ptr> image_size, + uint64_t location_len, + size_t one_raster_size, + uint32_t batch_size, + uint32_t prefetch_factor, + uint32_t num_workers); + + ~ThreadBatchDataLoader(); + + operator bool() const; + + uint8_t* raster_pointer(uint64_t location_index) const; + uint32_t request(uint32_t load_size = 0); + uint32_t wait_batch(); + /** + * @brief Return the next batch of data. + * + * If the number of workers is zero, this function will return the ownership of the data. + * @return uint8_t* The pointer to the data. 
+ */ + uint8_t* next_data(); + + BatchDataProcessor* batch_data_processor(); + std::shared_ptr wait_for_processing(uint32_t index); + + uint64_t size() const; + uint32_t batch_size() const; + + uint64_t total_batch_count() const; + uint64_t processed_batch_count() const; + uint8_t* data() const; + uint32_t data_batch_size() const; + + bool enqueue(std::function task, const TileInfo& tile); + +private: + bool stopped_ = false; + LoadFunc load_func_; + cucim::io::Device out_device_; + std::unique_ptr> location_ = nullptr; + std::unique_ptr> image_size_ = nullptr; + uint64_t location_len_ = 0; + size_t one_rester_size_ = 0; + uint32_t batch_size_ = 1; + uint32_t prefetch_factor_ = 2; + uint32_t num_workers_ = 0; + + // For nvjpeg + std::unique_ptr batch_data_processor_; + + size_t buffer_item_len_ = 0; + size_t buffer_size_ = 0; + std::vector raster_data_; + std::deque> tasks_; + // NOTE: the order is important ('thread_pool_' depends on 'raster_data_' and 'tasks_') + cucim::concurrent::ThreadPool thread_pool_; + + uint64_t queued_item_count_ = 0; + uint64_t buffer_item_head_index_ = 0; + uint64_t buffer_item_tail_index_ = 0; + + std::deque batch_item_counts_; + uint64_t processed_batch_count_ = 0; + uint8_t* current_data_ = nullptr; + uint32_t current_data_batch_size_ = 0; +}; + +} // namespace cucim::loader + +#endif // CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H diff --git a/cpp/include/cucim/loader/tile_info.h b/cpp/include/cucim/loader/tile_info.h new file mode 100644 index 000000000..e55bc03a4 --- /dev/null +++ b/cpp/include/cucim/loader/tile_info.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef CUCIM_LOADER_TILE_INFO_H
+#define CUCIM_LOADER_TILE_INFO_H
+
+#include "cucim/macros/api_header.h"
+
+#include <cstdint>
+
+namespace cucim::loader
+{
+
+struct EXPORT_VISIBLE TileInfo
+{
+    int64_t location_index = 0; // patch #
+    int64_t index = 0; // tile #
+    uint64_t offset = 0;
+    uint64_t size = 0;
+};
+
+} // namespace cucim::loader
+
+#endif // CUCIM_LOADER_TILE_INFO_H
diff --git a/cpp/include/cucim/profiler/nvtx3.h b/cpp/include/cucim/profiler/nvtx3.h
index 494bc9320..e438a305a 100644
--- a/cpp/include/cucim/profiler/nvtx3.h
+++ b/cpp/include/cucim/profiler/nvtx3.h
@@ -188,8 +188,10 @@ DEFINE_EVENT(ifd_read, "IFD::read()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_slowpath, "IFD::read::slow_path", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles, "IFD::read_region_tiles()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_iter, "IFD::read_region_tiles::iter", io, 255, 255, 0, 0);
+DEFINE_EVENT(ifd_read_region_tiles_task, "IFD::read_region_tiles::task", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_boundary, "IFD::read_region_tiles_boundary()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_boundary_iter, "IFD::read_region_tiles_boundary::iter", io, 255, 255, 0, 0);
+DEFINE_EVENT(ifd_read_region_tiles_boundary_task, "IFD::read_region_tiles_boundary::task", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_decompression, "IFD::decompression", compute, 255, 0, 255, 0);
 
 DEFINE_EVENT(decoder_libjpeg_turbo_tjAlloc, "libjpeg-turbo::tjAlloc()", memory, 255, 63, 72, 204);
diff --git a/cpp/include/cucim/util/cuda.h b/cpp/include/cucim/util/cuda.h
index f89b1adaf..12f161f1e 100644
--- a/cpp/include/cucim/util/cuda.h
+++ b/cpp/include/cucim/util/cuda.h
@@ -17,7 +17,10 @@
 #ifndef CUCIM_UTIL_CUDA_H
 #define CUCIM_UTIL_CUDA_H
 
-#include <cuda_runtime.h>
+
+#if CUCIM_SUPPORT_CUDA
+# include <cuda_runtime.h>
+#endif
 
 #define CUDA_TRY(stmt) \
     { \
@@ -29,6 +32,38 @@
         } \
     }
 
+#define CUDA_ERROR(stmt) \
+    { \
+        cuda_status = stmt; \
+        if (cudaSuccess != cuda_status) \
+        { \
+            throw std::runtime_error( \
+                fmt::format("[Error] CUDA Runtime call {} in line {} of file {} failed with '{}' ({}).\n", #stmt, \
+                            __LINE__, __FILE__, cudaGetErrorString(cuda_status), cuda_status)); \
+        } \
+    }
+
+#define NVJPEG_TRY(stmt) \
+    { \
+        nvjpegStatus_t _nvjpeg_status = stmt; \
+        if (_nvjpeg_status != NVJPEG_STATUS_SUCCESS) \
+        { \
+            fmt::print("[Error] NVJPEG call {} in line {} of file {} failed with the error code {}.\n", #stmt, \
+                       __LINE__, __FILE__, _nvjpeg_status); \
+        } \
+    }
+
+#define NVJPEG_ERROR(stmt) \
+    { \
+        nvjpegStatus_t _nvjpeg_status = stmt; \
+        if (_nvjpeg_status != NVJPEG_STATUS_SUCCESS) \
+        { \
+            throw std::runtime_error( \
+                fmt::format("[Error] NVJPEG call {} in line {} of file {} failed with the error code {}.\n", #stmt, \
+                            __LINE__, __FILE__, _nvjpeg_status)); \
+        } \
+    }
+
 namespace cucim::util
 {
diff --git a/cpp/plugins/cucim.kit.cumed/CMakeLists.txt b/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
index 3917bb571..4073ecfd5 100644
--- a/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
+++ b/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
@@ -70,6 +70,8 @@ endif()
 
 # Find CUDAToolkit as rmm depends on it
 find_package(CUDAToolkit REQUIRED)
+# For Threads::Threads
+find_package(Threads REQUIRED)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED YES)
diff --git a/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp b/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
index ebea041a5..8f0cb5f6e 100644
--- a/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
+++ b/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
@@ -77,7 +77,7
@@ static bool CUCIM_ABI checker_is_valid(const char* file_name, const char* buf, s return false; } -static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path_) +static CuCIMFileHandle_share CUCIM_ABI parser_open(const char* file_path_) { const cucim::filesystem::Path& file_path = file_path_; @@ -95,13 +95,13 @@ static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path_) throw std::invalid_argument(fmt::format("Cannot open {}!", file_path)); } - // TODO: make file_handle_ object to pointer - auto file_handle = CuCIMFileHandle{ fd, nullptr, FileHandleType::kPosix, file_path_cstr, nullptr }; + auto file_handle = std::make_shared(fd, nullptr, FileHandleType::kPosix, file_path_cstr, nullptr); + CuCIMFileHandle_share handle = new std::shared_ptr(std::move(file_handle)); - return file_handle; + return handle; } -static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) +static bool CUCIM_ABI parser_parse(CuCIMFileHandle_ptr handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) { (void)handle; if (!out_metadata_desc || !out_metadata_desc->handle) @@ -210,13 +210,10 @@ static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::I return true; } -static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) +static bool CUCIM_ABI parser_close(CuCIMFileHandle_ptr handle_ptr) { - if (handle->path) - { - cucim_free(handle->path); - handle->path = nullptr; - } + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); + if (handle->client_data) { // TODO: comment out and reinterpret_cast when needed. @@ -227,12 +224,13 @@ static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) return true; } -static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, +static bool CUCIM_ABI reader_read(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageReaderRegionRequestDesc* request, cucim::io::format::ImageDataDesc* out_image_data, cucim::io::format::ImageMetadataDesc* out_metadata_desc = nullptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; @@ -378,10 +376,11 @@ static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, return true; } -static bool CUCIM_ABI writer_write(const CuCIMFileHandle* handle, +static bool CUCIM_ABI writer_write(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageDataDesc* image_data) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; (void)image_data; diff --git a/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml b/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml index f08604bb6..6d70257c7 100644 --- a/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml +++ b/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml @@ -1,2 +1,2 @@ - + \ No newline at end of file diff --git a/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt b/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt index ce4314677..37dbca4f7 100644 --- a/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt +++ b/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt @@ -70,6 +70,8 @@ endif() # Find CUDAToolkit as rmm depends on it find_package(CUDAToolkit REQUIRED) +# For Threads::Threads +find_package(Threads REQUIRED) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED YES) @@ -123,6 +125,8 @@ superbuild_depend(cli11) superbuild_depend(pugixml) superbuild_depend(json) 
superbuild_depend(libdeflate) +superbuild_depend(nvjpeg) +superbuild_depend(libculibos) ################################################################################ # Find cucim package @@ -160,11 +164,15 @@ add_library(${CUCIM_PLUGIN_NAME} src/cuslide/deflate/deflate.h src/cuslide/jpeg/libjpeg_turbo.cpp src/cuslide/jpeg/libjpeg_turbo.h + src/cuslide/jpeg/libnvjpeg.cpp + src/cuslide/jpeg/libnvjpeg.h src/cuslide/jpeg2k/color_conversion.cpp src/cuslide/jpeg2k/color_conversion.h src/cuslide/jpeg2k/color_table.h src/cuslide/jpeg2k/libopenjpeg.cpp src/cuslide/jpeg2k/libopenjpeg.h + src/cuslide/loader/nvjpeg_processor.cpp + src/cuslide/loader/nvjpeg_processor.h ${deps-libopenjpeg_SOURCE_DIR}/src/bin/common/color.c # for color_sycc_to_rgb() and color_apply_icc_profile() src/cuslide/lzw/lzw.cpp src/cuslide/lzw/lzw.h @@ -208,6 +216,10 @@ target_compile_options(${CUCIM_PLUGIN_NAME} PRIVATE $<$:-W # Link libraries target_link_libraries(${CUCIM_PLUGIN_NAME} PRIVATE + # Use custom nvjpeg_static that supports GPU input (>= CUDA 11.6) + deps::nvjpeg_static # add this before cudart so that nvjpeg.h in static library takes precedence. + # Add CUDA::culibos to link necessary methods for 'deps::nvjpeg_static' + CUDA::culibos # for nvjpeg CUDA::cudart deps::fmt cucim::cucim diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake new file mode 100644 index 000000000..2938198ea --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake @@ -0,0 +1,42 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This module tries to find libculibos.a from /usr/local/cuda if +# CUDA::culibos is not available through `find_package(CUDAToolkit REQUIRED)`. 
+if (NOT TARGET CUDA::culibos) + + find_package(CUDAToolkit REQUIRED) + + if(NOT TARGET CUDA::culibos) + find_library(CUDA_culibos_LIBRARY + NAMES culibos + HINTS /usr/local/cuda + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ) + + mark_as_advanced(CUDA_culibos_LIBRARY) + + if (NOT TARGET CUDA::culibos AND CUDA_culibos_LIBRARY) + add_library(CUDA::culibos STATIC IMPORTED GLOBAL) + target_include_directories(CUDA::culibos SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + set_property(TARGET CUDA::culibos PROPERTY IMPORTED_LOCATION "${CUDA_culibos_LIBRARY}") + message("Set CUDA_culibos_LIBRARY to '${CUDA_culibos_LIBRARY}'.") + else () + message(FATAL_ERROR "Could not find CUDA::culibos.") + endif() + endif () + +endif () diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake index aea3eb7e4..428b04f0f 100644 --- a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake @@ -38,7 +38,6 @@ if (NOT TARGET deps::libopenjpeg) ########################################################################### # Build liblcms2 with the source in libopenjpeg ########################################################################### - message(" ##GG ${deps-libopenjpeg_SOURCE_DIR}") add_subdirectory(${deps-libopenjpeg_SOURCE_DIR}/thirdparty/liblcms2 ${deps-libopenjpeg_BINARY_DIR}/thirdparty/liblcms2) diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake new file mode 100644 index 000000000..8bcf8c911 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake @@ -0,0 +1,41 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if (NOT TARGET deps::nvjpeg_static) + + add_library(deps::nvjpeg_static STATIC IMPORTED GLOBAL) + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/include/nvjpeg.h) + set(NVJPEG_INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/include) + else () + message(FATAL_ERROR "nvjpeg.h not found") + endif () + + message("Set NVJPEG_INCLUDE_PATH to '${NVJPEG_INCLUDE_PATH}'.") + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/lib64/libnvjpeg_static.a) + set(NVJPEG_STATIC_LIB_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/lib64/libnvjpeg_static.a) + else () + message(FATAL_ERROR "libnvjpeg_static.a not found") + endif () + + message("Set NVJPEG_STATIC_LIB_PATH to '${NVJPEG_STATIC_LIB_PATH}'.") + + set_target_properties(deps::nvjpeg_static PROPERTIES + IMPORTED_LOCATION "${NVJPEG_STATIC_LIB_PATH}" + INTERFACE_INCLUDE_DIRECTORIES "${NVJPEG_INCLUDE_PATH}" + ) + +endif () diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp index 5e7c17d19..42b0dc894 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp @@ -82,15 +82,20 @@ static bool CUCIM_ABI checker_is_valid(const char* file_name, const char* buf, s return false; } -static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path) +static CuCIMFileHandle_share CUCIM_ABI parser_open(const char* file_path) { auto tif = new cuslide::tiff::TIFF(file_path, O_RDONLY); tif->construct_ifds(); - return tif->file_handle(); + // Move the ownership of the file handle object to the caller (CuImage). + auto handle_t = tif->file_handle(); + tif->file_handle() = nullptr; + CuCIMFileHandle_share handle = new std::shared_ptr(handle_t); + return handle; } -static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) +static bool CUCIM_ABI parser_parse(CuCIMFileHandle_ptr handle_ptr, cucim::io::format::ImageMetadataDesc* out_metadata_desc) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); if (!out_metadata_desc || !out_metadata_desc->handle) { throw std::runtime_error("out_metadata_desc shouldn't be nullptr!"); @@ -237,30 +242,34 @@ static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::I return true; } -static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) +static bool CUCIM_ABI parser_close(CuCIMFileHandle_ptr handle_ptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); + auto tif = static_cast(handle->client_data); delete tif; handle->client_data = nullptr; return true; } -static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, +static bool CUCIM_ABI reader_read(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageReaderRegionRequestDesc* request, cucim::io::format::ImageDataDesc* out_image_data, cucim::io::format::ImageMetadataDesc* out_metadata = nullptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); auto tif = static_cast(handle->client_data); bool result = tif->read(metadata, request, out_image_data, out_metadata); return result; } -static bool CUCIM_ABI writer_write(const CuCIMFileHandle* handle, +static bool CUCIM_ABI writer_write(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageDataDesc* image_data) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; (void)image_data; 
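To summarize the opaque-handle convention introduced above (an illustration with hypothetical helper names, not code from this patch): `parser_open()` returns a `CuCIMFileHandle_share` that owns a heap-allocated `std::shared_ptr<CuCIMFileHandle>`, so the caller (CuImage) controls the handle's lifetime, while the other plugin callbacks only borrow the raw object as a `CuCIMFileHandle_ptr`:

    #include <memory>

    struct CuCIMFileHandle; // defined in cucim/filesystem/file_handle.h

    typedef void* CuCIMFileHandle_share; // owns: new std::shared_ptr<CuCIMFileHandle>
    typedef void* CuCIMFileHandle_ptr; // borrows: CuCIMFileHandle*

    // parser_open() style: move ownership into a heap-allocated shared_ptr that
    // the caller takes over, keeping the handle alive while it is referenced.
    CuCIMFileHandle_share wrap_handle(std::shared_ptr<CuCIMFileHandle> handle)
    {
        return new std::shared_ptr<CuCIMFileHandle>(std::move(handle));
    }

    // parser_parse()/parser_close()/reader_read() style: borrow the raw pointer.
    CuCIMFileHandle* unwrap_handle(CuCIMFileHandle_ptr handle_ptr)
    {
        return reinterpret_cast<CuCIMFileHandle*>(handle_ptr);
    }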
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp new file mode 100644 index 000000000..a638215d5 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp @@ -0,0 +1,57 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "libnvjpeg.h" + +#include +#include + +namespace cuslide::jpeg +{ + + +#define THROW(action, message) \ + { \ + printf("ERROR in line %d while %s:\n%s\n", __LINE__, action, message); \ + retval = -1; \ + goto bailout; \ + } + + +bool decode_libnvjpeg(const int fd, + const unsigned char* jpeg_buf, + const uint64_t offset, + const uint64_t size, + const void* jpegtable_data, + const uint32_t jpegtable_count, + uint8_t** dest, + const cucim::io::Device& out_device) +{ + (void)out_device; + (void)fd; + (void)jpeg_buf; + (void)offset; + (void)size; + (void)jpegtable_data; + (void)jpegtable_count; + (void)dest; + (void)out_device; + + return true; +} + +} // namespace cuslide::jpeg diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h new file mode 100644 index 000000000..6457d3a0c --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h @@ -0,0 +1,36 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CUSLIDE_LIBNVJPEG_H +#define CUSLIDE_LIBNVJPEG_H + +#include + +namespace cuslide::jpeg +{ + +EXPORT_VISIBLE bool decode_libnvjpeg(int fd, + const unsigned char* jpeg_buf, + uint64_t offset, + uint64_t size, + const void* jpegtable_data, + uint32_t jpegtable_count, + uint8_t** dest, + const cucim::io::Device& out_device); + +} // namespace cuslide::jpeg + +#endif // CUSLIDE_LIBNVJPEG_H diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp new file mode 100644 index 000000000..eccd62549 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp @@ -0,0 +1,439 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nvjpeg_processor.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#define ALIGN_UP(x, align_to) (((uint64_t)(x) + ((uint64_t)(align_to)-1)) & ~((uint64_t)(align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((uint64_t)(x) & ~((uint64_t)(align_to)-1))
+namespace cuslide::loader
+{
+
+constexpr uint32_t MAX_CUDA_BATCH_SIZE = 1024;
+
+NvJpegProcessor::NvJpegProcessor(CuCIMFileHandle* file_handle,
+                                 const cuslide::tiff::IFD* ifd,
+                                 const int64_t* request_location,
+                                 const int64_t* request_size,
+                                 const uint64_t location_len,
+                                 const uint32_t batch_size,
+                                 uint32_t maximum_tile_count,
+                                 const uint8_t* jpegtable_data,
+                                 const uint32_t jpegtable_size)
+    : cucim::loader::BatchDataProcessor(batch_size), file_handle_(file_handle), ifd_(ifd)
+{
+    if (maximum_tile_count > 1)
+    {
+        // Calculate the nearest power of 2 that is equal to or larger than the given number.
+        // (Test with https://godbolt.org/z/n7qhPYzfP)
+        int next_candidate = maximum_tile_count & (maximum_tile_count - 1);
+        if (next_candidate > 0)
+        {
+            maximum_tile_count <<= 1;
+            while (true)
+            {
+                next_candidate = maximum_tile_count & (maximum_tile_count - 1);
+                if (next_candidate == 0)
+                {
+                    break;
+                }
+                maximum_tile_count = next_candidate;
+            }
+        }
+
+        // Do not exceed MAX_CUDA_BATCH_SIZE for decoding JPEG with nvJPEG
+        uint32_t cuda_batch_size = std::min(maximum_tile_count, MAX_CUDA_BATCH_SIZE);
+
+        // Update prefetch_factor
+        // (We can decode/cache at least twice the number of tiles needed for batch decoding)
+        // E.g., (128 - 1) / 32 + 1 ~= 4 => 8 (for 256 tiles) for cuda_batch_size(=128) and batch_size(=32)
+        preferred_loader_prefetch_factor_ = ((cuda_batch_size - 1) / batch_size_ + 1) * 2;
+
+        // Create cuda image cache
+        cucim::cache::ImageCacheConfig cache_config{};
+        cache_config.type = cucim::cache::CacheType::kPerProcess;
+        cache_config.memory_capacity = 1024 * 1024; // 1TB: set to fairly large memory so that memory_capacity is not a
+                                                    // limiter.
+ cache_config.capacity = cuda_batch_size * 2; // limit the number of cache item to cuda_batch_size * 2 + cuda_image_cache_ = + std::move(cucim::cache::ImageCacheManager::create_cache(cache_config, cucim::io::DeviceType::kCUDA)); + + cuda_batch_size_ = cuda_batch_size; + + // Initialize nvjpeg + cudaError_t cuda_status; + + if (NVJPEG_STATUS_SUCCESS != nvjpegCreate(backend_, NULL, &handle_)) + { + throw std::runtime_error(fmt::format("NVJPEG initialization error")); + } + if (NVJPEG_STATUS_SUCCESS != nvjpegJpegStateCreate(handle_, &state_)) + { + throw std::runtime_error(fmt::format("JPEG state initialization error")); + } + + nvjpegDecodeBatchedParseJpegTables(handle_, state_, jpegtable_data, jpegtable_size); + nvjpegDecodeBatchedInitialize(handle_, state_, cuda_batch_size_, 1, output_format_); + + CUDA_ERROR(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + + raw_cuda_inputs_.reserve(cuda_batch_size_); + raw_cuda_inputs_len_.reserve(cuda_batch_size_); + + for (uint32_t i = 0; i < cuda_batch_size_; ++i) + { + raw_cuda_outputs_.emplace_back(); // add all-zero nvjpegImage_t object + } + + // Read file block in advance + tile_width_ = ifd->tile_width(); + tile_width_bytes_ = tile_width_ * ifd->pixel_size_nbytes(); + tile_height_ = ifd->tile_height(); + tile_raster_nbytes_ = tile_width_bytes_ * tile_height_; + + struct stat sb; + fstat(file_handle_->fd, &sb); + file_size_ = sb.st_size; + file_start_offset_ = 0; + file_block_size_ = file_size_; + + update_file_block_info(request_location, request_size, location_len); + + constexpr int BLOCK_SECTOR_SIZE = 4096; + switch (backend_) + { + case NVJPEG_BACKEND_GPU_HYBRID: + cufile_ = cucim::filesystem::open(file_handle->path, "rp"); + unaligned_host_ = static_cast(cucim_malloc(file_block_size_ + BLOCK_SECTOR_SIZE * 2)); + aligned_host_ = reinterpret_cast(ALIGN_UP(unaligned_host_, BLOCK_SECTOR_SIZE)); + cufile_->pread(aligned_host_, file_block_size_, file_start_offset_); + break; + case NVJPEG_BACKEND_GPU_HYBRID_DEVICE: + cufile_ = cucim::filesystem::open(file_handle->path, "r"); + CUDA_ERROR(cudaMalloc(&unaligned_device_, file_block_size_ + BLOCK_SECTOR_SIZE)); + aligned_device_ = reinterpret_cast(ALIGN_UP(unaligned_device_, BLOCK_SECTOR_SIZE)); + cufile_->pread(aligned_device_, file_block_size_, file_start_offset_); + break; + default: + throw std::runtime_error("Unsupported backend type"); + } + } +} + +NvJpegProcessor::~NvJpegProcessor() +{ + if (unaligned_host_) + { + cucim_free(unaligned_host_); + unaligned_host_ = nullptr; + } + + cudaError_t cuda_status; + if (unaligned_device_) + { + CUDA_ERROR(cudaFree(unaligned_device_)); + unaligned_device_ = nullptr; + } + + for (uint32_t i = 0; i < cuda_batch_size_; ++i) + { + if (raw_cuda_outputs_[i].channel[0]) + { + CUDA_ERROR(cudaFree(raw_cuda_outputs_[i].channel[0])); + raw_cuda_outputs_[i].channel[0] = nullptr; + } + } + + if (state_) + { + NVJPEG_ERROR(nvjpegJpegStateDestroy(state_)); + state_ = nullptr; + } + if (handle_) + { + NVJPEG_ERROR(nvjpegDestroy(handle_)); + handle_ = nullptr; + } +} + +uint32_t NvJpegProcessor::request(std::deque& batch_item_counts, const uint32_t num_remaining_patches) +{ + (void)batch_item_counts; + std::vector tile_to_request; + if (tiles_.empty()) + { + return 0; + } + + // Return if we need to wait until previous cuda batch is consumed. 
+ auto& first_tile = tiles_.front(); + if (first_tile.location_index <= fetch_after_.location_index) + { + if (first_tile.location_index < fetch_after_.location_index || first_tile.index < fetch_after_.index) + { + return 0; + } + } + + // Set fetch_after_ to the last tile info of previously processed cuda batch + if (!cache_tile_queue_.empty()) + { + fetch_after_ = cache_tile_map_[cache_tile_queue_.back()]; + } + + // Remove previous batch (keep last 'cuda_batch_size_' items) before adding/processing new cuda batch + std::vector removed_tiles; + while (cache_tile_queue_.size() > cuda_batch_size_) + { + uint32_t removed_tile_index = cache_tile_queue_.front(); + auto removed_tile = cache_tile_map_.find(removed_tile_index); + removed_tiles.push_back(removed_tile->second); + cache_tile_queue_.pop_front(); + cache_tile_map_.erase(removed_tile_index); + } + + // Collect candidates + for (auto tile : tiles_) + { + auto index = tile.index; + if (tile_to_request.size() >= cuda_batch_size_) + { + break; + } + if (cache_tile_map_.find(index) == cache_tile_map_.end()) + { + if (tile.size == 0) + { + continue; + } + cache_tile_queue_.emplace_back(index); + cache_tile_map_.emplace(index, tile); + tile_to_request.emplace_back(tile); + } + } + + // Return if we need to wait until more patches are requested + if (tile_to_request.size() < cuda_batch_size_) + { + if (num_remaining_patches > 0) + { + // Restore cache_tile_queue_ and cache_tile_map_ + for (auto& added_tile : tile_to_request) + { + uint32_t added_index = added_tile.index; + cache_tile_queue_.pop_back(); + cache_tile_map_.erase(added_index); + } + for (auto rit = removed_tiles.rbegin(); rit != removed_tiles.rend(); ++rit) + { + uint32_t removed_index = rit->index; + cache_tile_queue_.emplace_front(removed_index); + cache_tile_map_.emplace(removed_index, *rit); + } + return 0; + } + else + { + // Completed, set fetch_after_ to the last tile info. 
+            fetch_after_ = tiles_.back();
+        }
+    }
+
+    uint8_t* file_block_ptr = nullptr;
+    switch (backend_)
+    {
+    case NVJPEG_BACKEND_GPU_HYBRID:
+        file_block_ptr = aligned_host_;
+        break;
+    case NVJPEG_BACKEND_GPU_HYBRID_DEVICE:
+        file_block_ptr = aligned_device_;
+        break;
+    default:
+        throw std::runtime_error("Unsupported backend type");
+    }
+
+    cudaError_t cuda_status;
+
+    // Initialize batch data with the first data
+    if (raw_cuda_inputs_.empty())
+    {
+        for (uint32_t i = 0; i < cuda_batch_size_; ++i)
+        {
+            uint8_t* mem_offset = file_block_ptr + tile_to_request[0].offset - file_start_offset_;
+            raw_cuda_inputs_.push_back((const unsigned char*)mem_offset);
+            raw_cuda_inputs_len_.push_back(tile_to_request[0].size);
+            CUDA_ERROR(cudaMallocPitch(
+                &raw_cuda_outputs_[i].channel[0], &raw_cuda_outputs_[i].pitch[0], tile_width_bytes_, tile_height_));
+        }
+        CUDA_ERROR(cudaStreamSynchronize(stream_));
+    }
+
+    // Set inputs to nvJPEG
+    size_t request_count = tile_to_request.size();
+    for (uint32_t i = 0; i < request_count; ++i)
+    {
+        uint8_t* mem_offset = file_block_ptr + tile_to_request[i].offset - file_start_offset_;
+        raw_cuda_inputs_[i] = mem_offset;
+        raw_cuda_inputs_len_[i] = tile_to_request[i].size;
+    }
+
+    int error_code = nvjpegDecodeBatched(
+        handle_, state_, raw_cuda_inputs_.data(), raw_cuda_inputs_len_.data(), raw_cuda_outputs_.data(), stream_);
+
+    if (NVJPEG_STATUS_SUCCESS != error_code)
+    {
+        throw std::runtime_error(fmt::format("Error in batched decode: {}", error_code));
+    }
+    CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+    // Remove the previous batch (keep the last 'cuda_batch_size_' items) before adding to cuda_image_cache_
+    // TODO: Utilize the removed tiles if the next batch uses them.
+    while (cuda_image_cache_->size() > cuda_batch_size_)
+    {
+        cuda_image_cache_->remove_front();
+    }
+
+    // Add to the image cache
+    for (uint32_t i = 0; i < request_count; ++i)
+    {
+        auto& added_tile = tile_to_request[i];
+
+        uint32_t index = added_tile.index;
+        uint64_t index_hash = cucim::codec::splitmix64(index);
+
+        auto key = cuda_image_cache_->create_key(0, index);
+
+        cuda_image_cache_->lock(index_hash);
+
+        uint8_t* tile_data = static_cast<uint8_t*>(cuda_image_cache_->allocate(tile_raster_nbytes_));
+
+        cudaError_t cuda_status;
+        CUDA_TRY(cudaMemcpy2D(tile_data, tile_width_bytes_, raw_cuda_outputs_[i].channel[0],
+                              raw_cuda_outputs_[i].pitch[0], tile_width_bytes_, tile_height_, cudaMemcpyDeviceToDevice));
+
+        const size_t tile_raster_nbytes = raw_cuda_inputs_len_[i];
+        auto value = cuda_image_cache_->create_value(tile_data, tile_raster_nbytes, cucim::io::DeviceType::kCUDA);
+        cuda_image_cache_->insert(key, value);
+        cuda_image_cache_->unlock(index_hash);
+    }
+
+    ++processed_cuda_batch_count_;
+
+    cuda_batch_cond_.notify_all();
+    return request_count;
+}
+
+uint32_t NvJpegProcessor::wait_batch(const uint32_t index_in_task,
+                                     std::deque<uint32_t>& batch_item_counts,
+                                     const uint32_t num_remaining_patches)
+{
+    // Check if the next (cuda) batch needs to be requested whenever the index in a task is divisible by the cuda batch size.
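+    // (one nvjpegDecodeBatched() call is issued per 'cuda_batch_size_' tile tasks.)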
+    // (each task, which handles one patch, consists of multiple tile-processing steps.)
+    if (index_in_task % cuda_batch_size_ == 0)
+    {
+        request(batch_item_counts, num_remaining_patches);
+    }
+    return 0;
+}
+
+std::shared_ptr<cucim::cache::ImageCacheValue> NvJpegProcessor::wait_for_processing(const uint32_t index)
+{
+    uint64_t index_hash = cucim::codec::splitmix64(index);
+    std::mutex* m = reinterpret_cast<std::mutex*>(cuda_image_cache_->mutex(index_hash));
+    std::shared_ptr<cucim::cache::ImageCacheValue> value;
+
+    std::unique_lock<std::mutex> lock(*m);
+    cuda_batch_cond_.wait(lock, [this, index, &value] {
+        // Stop waiting if the thread needs to stop or the cache value is available.
+        if (stopped_)
+        {
+            value = std::shared_ptr<cucim::cache::ImageCacheValue>();
+            return true;
+        }
+        auto key = cuda_image_cache_->create_key(0, index);
+        value = cuda_image_cache_->find(key);
+        return static_cast<bool>(value);
+    });
+    return value;
+}
+
+void NvJpegProcessor::shutdown()
+{
+    stopped_ = true;
+    cuda_batch_cond_.notify_all();
+}
+
+uint32_t NvJpegProcessor::preferred_loader_prefetch_factor()
+{
+    return preferred_loader_prefetch_factor_;
+}
+
+void NvJpegProcessor::update_file_block_info(const int64_t* request_location,
+                                             const int64_t* request_size,
+                                             const uint64_t location_len)
+{
+
+    uint32_t width = ifd_->width();
+    uint32_t height = ifd_->height();
+    uint32_t stride_y = width / tile_width_ + !!(width % tile_width_); // # of tiles in a row(y) in the ifd tile array
+                                                                       // as a grid
+    uint32_t stride_x = height / tile_height_ + !!(height % tile_height_); // # of tiles in a column(x) in the ifd tile
+                                                                           // array as a grid
+    int64_t min_tile_index = 1000000000;
+    int64_t max_tile_index = 0;
+
+    // Assume that tile offsets are increasing as the tile index increases.
+    for (size_t loc_index = 0; loc_index < location_len; ++loc_index)
+    {
+        int64_t sx = request_location[loc_index * 2];
+        int64_t sy = request_location[loc_index * 2 + 1];
+        int64_t offset_sx = static_cast<uint64_t>(sx) / tile_width_; // x-axis start offset for the requested region in
+                                                                     // the ifd tile array as a grid
+        int64_t offset_sy = static_cast<uint64_t>(sy) / tile_height_; // y-axis start offset for the requested region
+                                                                      // in the ifd tile array as a grid
+        int64_t tile_index = (offset_sy * stride_y) + offset_sx;
+        min_tile_index = std::min(min_tile_index, tile_index);
+        max_tile_index = std::max(max_tile_index, tile_index);
+    }
+
+    int64_t w = request_size[0];
+    int64_t h = request_size[1];
+    int64_t additional_index_x = (static_cast<uint64_t>(w) + (tile_width_ - 1)) / tile_width_;
+    int64_t additional_index_y = (static_cast<uint64_t>(h) + (tile_height_ - 1)) / tile_height_;
+    min_tile_index = std::max(min_tile_index, 0L);
+    max_tile_index =
+        std::min(stride_x * stride_y - 1,
+                 static_cast<uint32_t>(max_tile_index + (additional_index_y * stride_y) + additional_index_x));
+
+    auto& image_piece_offsets = const_cast<std::vector<uint64_t>&>(ifd_->image_piece_offsets());
+    auto& image_piece_bytecounts = const_cast<std::vector<uint64_t>&>(ifd_->image_piece_bytecounts());
+
+    uint64_t min_offset = image_piece_offsets[min_tile_index];
+    uint64_t max_offset = image_piece_offsets[max_tile_index] + image_piece_bytecounts[max_tile_index];
+
+    file_start_offset_ = min_offset;
+    file_block_size_ = max_offset - min_offset + 1;
+}
+
+} // namespace cuslide::loader
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h
new file mode 100644
index 000000000..221adf4c7
--- /dev/null
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h
@@ -0,0 +1,110 @@
+/*
+ * Apache License, Version 2.0
+ * Copyright 2021 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef CUSLIDE_NVJPEG_PROCESSOR_H
+#define CUSLIDE_NVJPEG_PROCESSOR_H
+
+
+#include <condition_variable>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include <nvjpeg.h>
+
+#include <cucim/cache/image_cache.h>
+#include <cucim/filesystem/cufile_driver.h>
+#include <cucim/filesystem/file_handle.h>
+#include <cucim/loader/batch_data_processor.h>
+#include <cucim/loader/tile_info.h>
+
+#include "cuslide/tiff/ifd.h"
+
+namespace cuslide::loader
+{
+
+class NvJpegProcessor : public cucim::loader::BatchDataProcessor
+{
+public:
+    NvJpegProcessor(CuCIMFileHandle* file_handle,
+                    const cuslide::tiff::IFD* ifd,
+                    const int64_t* request_location,
+                    const int64_t* request_size,
+                    uint64_t location_len,
+                    uint32_t batch_size,
+                    uint32_t maximum_tile_count,
+                    const uint8_t* jpegtable_data,
+                    uint32_t jpegtable_size);
+    ~NvJpegProcessor();
+
+    uint32_t request(std::deque<uint32_t>& batch_item_counts, uint32_t num_remaining_patches) override;
+    uint32_t wait_batch(uint32_t index_in_task,
+                        std::deque<uint32_t>& batch_item_counts,
+                        uint32_t num_remaining_patches) override;
+
+    std::shared_ptr<cucim::cache::ImageCacheValue> wait_for_processing(uint32_t index) override;
+
+    void shutdown() override;
+
+    uint32_t preferred_loader_prefetch_factor();
+
+private:
+    void update_file_block_info(const int64_t* request_location, const int64_t* request_size, uint64_t location_len);
+
+    bool stopped_ = false;
+    uint32_t preferred_loader_prefetch_factor_ = 2;
+
+    CuCIMFileHandle* file_handle_ = nullptr;
+    const cuslide::tiff::IFD* ifd_ = nullptr;
+    std::shared_ptr<cucim::filesystem::CuFileDriver> cufile_;
+    size_t tile_width_ = 0;
+    size_t tile_width_bytes_ = 0;
+    size_t tile_height_ = 0;
+    size_t tile_raster_nbytes_ = 0;
+    size_t file_size_ = 0;
+    size_t file_start_offset_ = 0;
+    size_t file_block_size_ = 0;
+
+    uint32_t cuda_batch_size_ = 1;
+    nvjpegHandle_t handle_ = nullptr;
+    nvjpegOutputFormat_t output_format_ = NVJPEG_OUTPUT_RGBI;
+    nvjpegJpegState_t state_;
+    nvjpegBackend_t backend_ = NVJPEG_BACKEND_GPU_HYBRID_DEVICE;
+    cudaStream_t stream_ = nullptr;
+
+    std::condition_variable cuda_batch_cond_;
+    std::unique_ptr<cucim::cache::ImageCache> cuda_image_cache_;
+    uint64_t processed_cuda_batch_count_ = 0;
+    cucim::loader::TileInfo fetch_after_{ -1, -1, 0, 0 };
+
+    std::deque<uint32_t> cache_tile_queue_;
+    std::unordered_map<uint32_t, cucim::loader::TileInfo> cache_tile_map_;
+
+    uint8_t* unaligned_host_ = nullptr;
+    uint8_t* aligned_host_ = nullptr;
+    uint8_t* unaligned_device_ = nullptr;
+    uint8_t* aligned_device_ = nullptr;
+
+    std::vector<const unsigned char*> raw_cuda_inputs_;
+    std::vector<size_t> raw_cuda_inputs_len_;
+    std::vector<nvjpegImage_t> raw_cuda_outputs_;
+};
+
+} // namespace cuslide::loader
+
+#endif // CUSLIDE_NVJPEG_PROCESSOR_H
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
index b4e59b6c0..9082a98ee 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
@@ -19,7 +19,9 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -32,10 +34,12 @@
 #include
 #include
 #include
+#include
 #include "cuslide/deflate/deflate.h"
 #include "cuslide/jpeg/libjpeg_turbo.h"
 #include "cuslide/jpeg2k/libopenjpeg.h"
+#include "cuslide/loader/nvjpeg_processor.h"
 #include "cuslide/lzw/lzw.h"
 #include
"cuslide/raw/raw.h" #include "tiff.h" @@ -48,17 +52,16 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind { PROF_SCOPED_RANGE(PROF_EVENT(ifd_ifd)); auto tif = tiff->client(); - int ret; char* software_char_ptr = nullptr; char* model_char_ptr = nullptr; - // TODO: error handling - ret = TIFFGetField(tif, TIFFTAG_SOFTWARE, &software_char_ptr); + // TODO: error handling + TIFFGetField(tif, TIFFTAG_SOFTWARE, &software_char_ptr); software_ = std::string(software_char_ptr ? software_char_ptr : ""); - ret = TIFFGetField(tif, TIFFTAG_MODEL, &model_char_ptr); + TIFFGetField(tif, TIFFTAG_MODEL, &model_char_ptr); model_ = std::string(model_char_ptr ? model_char_ptr : ""); - ret = TIFFGetField(tif, TIFFTAG_IMAGEDESCRIPTION, &model_char_ptr); + TIFFGetField(tif, TIFFTAG_IMAGEDESCRIPTION, &model_char_ptr); image_description_ = std::string(model_char_ptr ? model_char_ptr : ""); TIFFDirectory& tif_dir = tif->tif_dir; @@ -82,26 +85,8 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind photometric_ = tif_dir.td_photometric; compression_ = tif_dir.td_compression; TIFFGetField(tif, TIFFTAG_PREDICTOR, &predictor_); - - // ret = TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width_); - // ret = TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height_); - // ret = TIFFGetField(tif, TIFFTAG_TILEWIDTH, &tile_width_); - // ret = TIFFGetField(tif, TIFFTAG_TILELENGTH, &tile_height_); - // ret = TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bits_per_sample_); - // ret = TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel_); - // ret = TIFFGetField(tif, TIFFTAG_SUBFILETYPE, &subfile_type_); // for checking if FILETYPE_REDUCEDIMAGE - // ret = TIFFGetField(tif, TIFFTAG_PLANARCONFIG, &planar_config_); - // ret = TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric_); - // ret = TIFFGetField(tif, TIFFTAG_COMPRESSION, &compression_); - // printf("[GB] offset_entry:%lu %p\n", tif->tif_dir.td_stripoffset_entry.tdir_count, - // tif->tif_dir.td_stripoffset_p); printf("[GB] width: %d %d\n", tif->tif_dir.td_imagewidth, width_); - // printf("[GB] bytecount entry2:%lu %p\n", tif->tif_dir.td_stripbytecount_entry.tdir_count, - // tif->tif_dir.td_stripbytecount_p); - (void)ret; - subifd_count_ = tif_dir.td_nsubifd; uint64_t* subifd_offsets = tif_dir.td_subifd; - // ret = TIFFGetField(tif, TIFFTAG_SUBIFD, &subifd_count, &subifd_offsets); if (subifd_count_) { subifd_offsets_.resize(subifd_count_); @@ -113,7 +98,7 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind uint8_t* jpegtable_data = nullptr; uint32_t jpegtable_count = 0; - ret = TIFFGetField(tif, TIFFTAG_JPEGTABLES, &jpegtable_count, &jpegtable_data); + TIFFGetField(tif, TIFFTAG_JPEGTABLES, &jpegtable_count, &jpegtable_data); jpegtable_.reserve(jpegtable_count); jpegtable_.insert(jpegtable_.end(), jpegtable_data, jpegtable_data + jpegtable_count); @@ -139,7 +124,7 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind image_piece_bytecounts_.end(), &td_stripbytecount_p[0], &td_stripbytecount_p[image_piece_count_]); // Calculate hash value with IFD index - hash_value_ = tiff->file_handle_.hash_value ^ cucim::codec::splitmix64(index); + hash_value_ = tiff->file_handle_->hash_value ^ cucim::codec::splitmix64(index); // TIFFPrintDirectory(tif, stdout, TIFFPRINT_STRIPS); } @@ -164,12 +149,16 @@ bool IFD::read(const TIFF* tiff, int64_t sx = request->location[0]; int64_t sy = request->location[1]; + uint32_t batch_size = request->batch_size; int64_t w = request->size[0]; 
 int64_t h = request->size[1];
 int32_t n_ch = samples_per_pixel_; // number of channels
+    int ndim = 3;
 size_t raster_size = w * h * samples_per_pixel_;
 void* raster = nullptr;
+    auto raster_type = cucim::io::DeviceType::kCPU;
+
 DLTensor* out_buf = request->buf;
 bool is_buf_available = out_buf && out_buf->data;
@@ -181,16 +170,143 @@
 if (is_read_optimizable())
 {
-        if (!raster)
+        if (batch_size > 1)
+        {
+            ndim = 4;
+        }
+        int64_t* location = request->location;
+        uint64_t location_len = request->location_len;
+        const uint32_t num_workers = request->num_workers;
+        const bool drop_last = request->drop_last;
+        uint32_t prefetch_factor = request->prefetch_factor;
+        const bool shuffle = request->shuffle;
+        const uint64_t seed = request->seed;
+
+        if (num_workers == 0 && location_len > 1)
         {
-            raster = cucim_malloc(raster_size); // RGB image
-            memset(raster, 0, raster_size);
+            throw std::runtime_error("Cannot read multiple locations with zero workers!");
         }
+        // Shuffle data
+        if (shuffle)
+        {
+            auto rng = std::default_random_engine{ seed };
+            struct position
+            {
+                int64_t x;
+                int64_t y;
+            };
+            std::shuffle(reinterpret_cast<position*>(&location[0]),
+                         reinterpret_cast<position*>(&location[location_len * 2]), rng);
+        }
+
+        // Adjust the location length based on 'drop_last'
+        const uint32_t remaining_len = location_len % batch_size;
+        if (drop_last)
+        {
+            location_len -= remaining_len;
+        }
 
-        if (!read_region_tiles(tiff, this, sx, sy, w, h, raster, out_device))
+        // Do not use prefetch if the image is too small
+        if (1 + prefetch_factor > location_len)
         {
-            fmt::print(stderr, "[Error] Failed to read region with libjpeg!\n");
+            prefetch_factor = location_len - 1;
+        }
+
+        size_t one_raster_size = raster_size;
+        raster_size *= batch_size;
+
+        const IFD* ifd = this;
+
+        if (location_len > 1 || batch_size > 1 || num_workers > 0)
+        {
+            // Reconstruct location
+            std::unique_ptr<std::vector<int64_t>>* location_unique =
+                reinterpret_cast<std::unique_ptr<std::vector<int64_t>>*>(request->location_unique);
+            std::unique_ptr<std::vector<int64_t>> request_location = std::move(*location_unique);
+            delete location_unique;
+
+            // Reconstruct size
+            std::unique_ptr<std::vector<int64_t>>* size_unique =
+                reinterpret_cast<std::unique_ptr<std::vector<int64_t>>*>(request->size_unique);
+            std::unique_ptr<std::vector<int64_t>> request_size = std::move(*size_unique);
+            delete size_unique;
+
+            auto load_func = [tiff, ifd, location, w, h, out_device](
+                                 cucim::loader::ThreadBatchDataLoader* loader_ptr, uint64_t location_index) {
+                uint8_t* raster_ptr = loader_ptr->raster_pointer(location_index);
+
+                if (!read_region_tiles(tiff, ifd, location, location_index, w, h,
+                                       raster_ptr, out_device, loader_ptr))
+                {
+                    fmt::print(stderr, "[Error] Failed to read region!\n");
+                }
+            };
+
+            uint32_t maximum_tile_count = 0;
+
+            std::unique_ptr<cucim::loader::BatchDataProcessor> batch_processor;
+
+            // Set raster_type to CUDA because the loader will handle this with nvjpeg
+            if (out_device.type() == cucim::io::DeviceType::kCUDA)
+            {
+                raster_type = cucim::io::DeviceType::kCUDA;
+
+                // The maximal number of tiles (x-axis) overlapped with the given patch
+                uint32_t tile_across_count = std::min(static_cast<uint64_t>(ifd->width_) + (ifd->tile_width_ - 1),
+                                                      static_cast<uint64_t>(w) + (ifd->tile_width_ - 1)) /
+                                                 ifd->tile_width_ +
+                                             1;
+                // The maximal number of tiles (y-axis) overlapped with the given patch
+                uint32_t tile_down_count = std::min(static_cast<uint64_t>(ifd->height_) + (ifd->tile_height_ - 1),
+                                                    static_cast<uint64_t>(h) + (ifd->tile_height_ - 1)) /
+                                               ifd->tile_height_ +
+                                           1;
+                // The maximal number of possible tiles (# of tasks) to load for the given image batch
+                maximum_tile_count = tile_across_count * tile_down_count * batch_size;
+
+                // Create NvJpegProcessor
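+                // (the processor owns a small CUDA image cache, sized to two cuda batches, which holds
+                // tiles decoded by nvjpegDecodeBatched() until the worker tasks copy them out.)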
+                auto& jpegtable = ifd->jpegtable_;
+                const void* jpegtable_data = jpegtable.data();
+                uint32_t jpegtable_size = jpegtable.size();
+
+                auto nvjpeg_processor = std::make_unique<cuslide::loader::NvJpegProcessor>(
+                    tiff->file_handle_, ifd, request_location->data(), request_size->data(), location_len, batch_size,
+                    maximum_tile_count, static_cast<const uint8_t*>(jpegtable_data), jpegtable_size);
+
+                // Update prefetch_factor
+                prefetch_factor = nvjpeg_processor->preferred_loader_prefetch_factor();
+
+                batch_processor = std::move(nvjpeg_processor);
+            }
+
+            auto loader = std::make_unique<cucim::loader::ThreadBatchDataLoader>(
+                load_func, std::move(batch_processor), out_device, std::move(request_location), std::move(request_size),
+                location_len, one_raster_size, batch_size, prefetch_factor, num_workers);
+
+            const uint32_t load_size = std::min(static_cast<uint64_t>(batch_size) * (1 + prefetch_factor), location_len);
+
+            loader->request(load_size);
+
+            // If it reads the entire image with multiple threads (using the loader), fetch the next item.
+            if (location_len == 1 && batch_size == 1)
+            {
+                raster = loader->next_data();
+            }
+
+            out_image_data->loader = loader.release(); // set the loader on out_image_data
+        }
+        else
+        {
+            if (!raster)
+            {
+                raster = cucim_malloc(one_raster_size);
+            }
+
+            if (!read_region_tiles(tiff, ifd, location, 0, w, h, raster, out_device, nullptr))
+            {
+                fmt::print(stderr, "[Error] Failed to read region!\n");
+            }
+        }
     }
     else
@@ -260,14 +376,23 @@
         }
     }
 
-    int ndim = 3;
     int64_t* shape = static_cast<int64_t*>(cucim_malloc(sizeof(int64_t) * ndim));
-    shape[0] = h;
-    shape[1] = w;
-    shape[2] = n_ch;
+    if (ndim == 3)
+    {
+        shape[0] = h;
+        shape[1] = w;
+        shape[2] = n_ch;
+    }
+    else // ndim == 4
+    {
+        shape[0] = batch_size;
+        shape[1] = h;
+        shape[2] = w;
+        shape[3] = n_ch;
+    }
 
     // Copy the raster memory and free it if needed.
-    if (!is_buf_available)
+    if (!is_buf_available && raster && raster_type == cucim::io::DeviceType::kCPU)
     {
         cucim::memory::move_raster_from_host(&raster, raster_size, out_device);
     }
@@ -386,6 +511,19 @@ const std::vector<uint64_t>& IFD::image_piece_bytecounts() const
     return image_piece_bytecounts_;
 }
 
+size_t IFD::pixel_size_nbytes() const
+{
+    const int pixel_format = TJPF_RGB; // TODO: support other pixel formats
+    const int nbytes = tjPixelSize[pixel_format];
+    return nbytes;
+}
+
+size_t IFD::tile_raster_size_nbytes() const
+{
+    const size_t nbytes = tile_width_ * tile_height_ * pixel_size_nbytes();
+    return nbytes;
+}
+
 bool IFD::is_compression_supported() const
 {
     switch (compression_)
@@ -419,16 +557,19 @@ bool IFD::is_format_supported() const
 
 bool IFD::read_region_tiles(const TIFF* tiff,
                             const IFD* ifd,
-                            const int64_t sx,
-                            const int64_t sy,
+                            const int64_t* location,
+                            const int64_t location_index,
                             const int64_t w,
                             const int64_t h,
                             void* raster,
-                            const cucim::io::Device& out_device)
+                            const cucim::io::Device& out_device,
+                            cucim::loader::ThreadBatchDataLoader* loader)
 {
     PROF_SCOPED_RANGE(PROF_EVENT(ifd_read_region_tiles));
     // Reference code: https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/tjexample.c
+    int64_t sx = location[location_index * 2];
+    int64_t sy = location[location_index * 2 + 1];
     int64_t ex = sx + w - 1;
     int64_t ey = sy + h - 1;
 
@@ -438,7 +579,7 @@ bool IFD::read_region_tiles(const TIFF* tiff,
     // Handle out-of-boundary case
     if (sx < 0 || sy < 0 || sx >= width || sy >= height || ex < 0 || ey < 0 || ex >= width || ey >= height)
     {
-        return read_region_tiles_boundary(tiff, ifd, sx, sy, w, h, raster, out_device);
+        return read_region_tiles_boundary(tiff, ifd, location, location_index, w, h, raster, out_device, loader);
     }
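+    // From this point on, the requested region lies entirely within the image bounds.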
cucim::cache::ImageCache& image_cache = cucim::CuImage::cache_manager().cache(); cucim::cache::CacheType cache_type = image_cache.type(); @@ -476,19 +617,9 @@ bool IFD::read_region_tiles(const TIFF* tiff, uint32_t start_index_y = offset_sy * stride_y; uint32_t end_index_y = offset_ey * stride_y; - // Memory for tile_raster would be manually allocated here, instead of using decode_libjpeg(). - // Need to free the manually. Usually it is set to nullptr and memory is created by decode_libjpeg() by using - // tjAlloc() (Also need to free with tjFree() after use. See the documentation of tjAlloc() for the detail.) - const int pixel_format = TJPF_RGB; // TODO: support other pixel format - const int pixel_size_nbytes = tjPixelSize[pixel_format]; - const size_t tile_raster_nbytes = tw * th * pixel_size_nbytes; - uint8_t* tile_raster = nullptr; - if (cache_type == cucim::cache::CacheType::kNoCache) - { - tile_raster = static_cast(cucim_malloc(tile_raster_nbytes)); - } + const size_t tile_raster_nbytes = ifd->tile_raster_size_nbytes(); - int tiff_file = tiff->file_handle_.fd; + int tiff_file = tiff->file_handle_->fd; uint64_t ifd_hash_value = ifd->hash_value_; uint32_t dest_pixel_step_y = w * samples_per_pixel; @@ -506,132 +637,187 @@ bool IFD::read_region_tiles(const TIFF* tiff, uint32_t dest_pixel_index_x = 0; uint32_t index = index_y + offset_sx; - // Calculate a simple hash value for the tile index - uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); for (uint32_t offset_x = offset_sx; offset_x <= offset_ex; ++offset_x, ++index) { PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_iter, index)); auto tiledata_offset = static_cast(ifd->image_piece_offsets_[index]); auto tiledata_size = static_cast(ifd->image_piece_bytecounts_[index]); + // Calculate a simple hash value for the tile index + uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); + uint32_t tile_pixel_offset_x = (offset_x == offset_sx) ? pixel_offset_sx : 0; uint32_t nbytes_tile_pixel_size_x = (offset_x == offset_ex) ? (pixel_offset_ex - tile_pixel_offset_x + 1) * samples_per_pixel : (tw - tile_pixel_offset_x) * samples_per_pixel; - - uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; - uint32_t dest_pixel_index = dest_pixel_index_x; - - uint8_t* tile_data = tile_raster; - - if (tiledata_size > 0) - { - auto key = image_cache.create_key(ifd_hash_value, index); - image_cache.lock(index_hash); - auto value = image_cache.find(key); - if (value) + auto decode_func = [=, &image_cache]() { + PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_task, index_hash)); + uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; + uint32_t dest_pixel_index = dest_pixel_index_x; + uint8_t* tile_data = nullptr; + if (tiledata_size > 0) { - image_cache.unlock(index_hash); - tile_data = static_cast(value->data); - } - else - { - // Lifetime of tile_data is same with `value` - // : do not access this data when `value` is not accessible. 
- if (cache_type != cucim::cache::CacheType::kNoCache) - { - tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); - } + std::unique_ptr tile_raster = + std::unique_ptr(nullptr, cucim_free); + if (loader && loader->batch_data_processor()) { - PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); switch (compression_method) { - case COMPRESSION_NONE: - cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - break; case COMPRESSION_JPEG: - cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - jpegtable_data, jpegtable_count, &tile_data, out_device, - jpeg_color_space); - break; - case COMPRESSION_ADOBE_DEFLATE: - case COMPRESSION_DEFLATE: - cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device); - break; - case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kSYCC); - break; - case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kRGB); - break; - case COMPRESSION_LZW: - cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - // Apply unpredictor - // 1: none, 2: horizontal differencing, 3: floating point predictor - // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf - if (predictor == 2) - { - cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); - } break; default: throw std::runtime_error("Unsupported compression method"); } + auto value = loader->wait_for_processing(index); + if (!value) // if shutdown + { + return; + } + tile_data = static_cast(value->data); + + cudaError_t cuda_status; + CUDA_ERROR(cudaMemcpy2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + tile_data + nbytes_tile_index, nbytes_tw, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1, + cudaMemcpyDeviceToDevice)); } + else + { + auto key = image_cache.create_key(ifd_hash_value, index); + image_cache.lock(index_hash); + auto value = image_cache.find(key); + if (value) + { + image_cache.unlock(index_hash); + tile_data = static_cast(value->data); + } + else + { + // Lifetime of tile_data is same with `value` + // : do not access this data when `value` is not accessible. 
+ if (cache_type != cucim::cache::CacheType::kNoCache) + { + tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); + } + else + { + // Allocate temporary buffer for tile data + tile_raster = std::unique_ptr( + reinterpret_cast(cucim_malloc(tile_raster_nbytes)), cucim_free); + tile_data = tile_raster.get(); + } + { + PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); + switch (compression_method) + { + case COMPRESSION_NONE: + cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case COMPRESSION_JPEG: + cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, + jpegtable_data, jpegtable_count, &tile_data, + out_device, jpeg_color_space); + break; + case COMPRESSION_ADOBE_DEFLATE: + case COMPRESSION_DEFLATE: + cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kSYCC); + break; + case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kRGB); + break; + case COMPRESSION_LZW: + cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + // Apply unpredictor + // 1: none, 2: horizontal differencing, 3: floating point predictor + // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf + if (predictor == 2) + { + cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); + } + break; + default: + throw std::runtime_error("Unsupported compression method"); + } + } - value = image_cache.create_value(tile_data, tile_raster_nbytes); - image_cache.insert(key, value); - image_cache.unlock(index_hash); - } + value = image_cache.create_value(tile_data, tile_raster_nbytes); + image_cache.insert(key, value); + image_cache.unlock(index_hash); + } - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + nbytes_tile_pixel_size_x); + } + } + } + else { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, nbytes_tile_pixel_size_x); + if (out_device.type() == cucim::io::DeviceType::kCPU) + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + // Set background value such as (255,255,255) + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + cudaError_t cuda_status; + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, background_value, + nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1)); + } } + }; + + if (loader && *loader) + { + loader->enqueue(std::move(decode_func), + cucim::loader::TileInfo{ location_index, index, tiledata_offset, tiledata_size }); } else { - for (uint32_t ty = 
tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) - { - // Set (255,255,255) - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); - } + decode_func(); } + dest_pixel_index_x += nbytes_tile_pixel_size_x; } dest_start_ptr += dest_pixel_step_y * dest_pixel_offset_len_y; } - if (tile_raster) - { - cucim_free(tile_raster); - } - return true; } bool IFD::read_region_tiles_boundary(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device) + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader) { PROF_SCOPED_RANGE(PROF_EVENT(ifd_read_region_tiles_boundary)); (void)out_device; // Reference code: https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/tjexample.c + int64_t sx = location[location_index * 2]; + int64_t sy = location[location_index * 2 + 1]; uint8_t background_value = tiff->background_value_; uint16_t compression_method = ifd->compression_; @@ -647,14 +833,13 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, // Memory for tile_raster would be manually allocated here, instead of using decode_libjpeg(). // Need to free the manually. Usually it is set to nullptr and memory is created by decode_libjpeg() by using // tjAlloc() (Also need to free with tjFree() after use. See the documentation of tjAlloc() for the detail.) - const int pixel_format = TJPF_RGB; // TODO: support other pixel format - const int pixel_size_nbytes = tjPixelSize[pixel_format]; + const int pixel_size_nbytes = ifd->pixel_size_nbytes(); auto dest_start_ptr = static_cast(raster); bool is_out_of_image = (ex < 0 || width <= sx || ey < 0 || height <= sy); if (is_out_of_image) { - // Fill (255,255,255) and return + // Fill background color(255,255,255) and return memset(dest_start_ptr, background_value, w * h * pixel_size_nbytes); return true; } @@ -665,11 +850,6 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, uint32_t th = ifd->tile_height_; const size_t tile_raster_nbytes = tw * th * pixel_size_nbytes; - uint8_t* tile_raster = nullptr; - if (cache_type == cucim::cache::CacheType::kNoCache) - { - tile_raster = static_cast(cucim_malloc(tile_raster_nbytes)); - } // TODO: revert this once we can get RGB data instead of RGBA uint32_t samples_per_pixel = 3; // ifd->samples_per_pixel(); @@ -737,7 +917,7 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, int64_t boundary_index_y = offset_boundary_y * stride_y; - int tiff_file = tiff->file_handle_.fd; + int tiff_file = tiff->file_handle_->fd; uint64_t ifd_hash_value = ifd->hash_value_; uint32_t dest_pixel_step_y = w * samples_per_pixel; @@ -756,13 +936,15 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, uint32_t dest_pixel_index_x = 0; int64_t index = index_y + offset_sx; - // Calculate a simple hash value for the tile index - uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); for (int64_t offset_x = offset_sx; offset_x <= offset_ex; ++offset_x, ++index) { PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_boundary_iter, index)); uint64_t tiledata_offset = 0; uint64_t tiledata_size = 0; + + // Calculate a simple hash value for the tile index + uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); + if (offset_x >= offset_min_x && offset_x <= offset_max_x && index_y >= 
start_index_min_y && index_y <= end_index_max_y) { @@ -775,166 +957,257 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, (pixel_offset_ex - tile_pixel_offset_x + 1) * samples_per_pixel : (tw - tile_pixel_offset_x) * samples_per_pixel; - uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; - uint32_t dest_pixel_index = dest_pixel_index_x; - if (tiledata_size > 0) - { - bool copy_partial = false; - uint32_t fixed_nbytes_tile_pixel_size_x = nbytes_tile_pixel_size_x; - uint32_t fixed_tile_pixel_offset_ey = tile_pixel_offset_ey; + uint32_t nbytes_tile_index_orig = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; + uint32_t dest_pixel_index_orig = dest_pixel_index_x; - if (offset_x == offset_boundary_x) - { - copy_partial = true; - if (offset_x != offset_ex) - { - fixed_nbytes_tile_pixel_size_x = - (pixel_offset_boundary_x - tile_pixel_offset_x + 1) * samples_per_pixel; - } - else - { - fixed_nbytes_tile_pixel_size_x = - (std::min(pixel_offset_boundary_x, pixel_offset_ex) - tile_pixel_offset_x + 1) * - samples_per_pixel; - } - } - if (index_y == boundary_index_y) + auto decode_func = [=, &image_cache]() { + PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_boundary_task, index_hash)); + uint32_t nbytes_tile_index = nbytes_tile_index_orig; + uint32_t dest_pixel_index = dest_pixel_index_orig; + + if (tiledata_size > 0) { - copy_partial = true; - if (index_y != end_index_y) + bool copy_partial = false; + uint32_t fixed_nbytes_tile_pixel_size_x = nbytes_tile_pixel_size_x; + uint32_t fixed_tile_pixel_offset_ey = tile_pixel_offset_ey; + + if (offset_x == offset_boundary_x) { - fixed_tile_pixel_offset_ey = pixel_offset_boundary_y; + copy_partial = true; + if (offset_x != offset_ex) + { + fixed_nbytes_tile_pixel_size_x = + (pixel_offset_boundary_x - tile_pixel_offset_x + 1) * samples_per_pixel; + } + else + { + fixed_nbytes_tile_pixel_size_x = + (std::min(pixel_offset_boundary_x, pixel_offset_ex) - tile_pixel_offset_x + 1) * + samples_per_pixel; + } } - else + if (index_y == boundary_index_y) { - fixed_tile_pixel_offset_ey = std::min(pixel_offset_boundary_y, pixel_offset_ey); + copy_partial = true; + if (index_y != end_index_y) + { + fixed_tile_pixel_offset_ey = pixel_offset_boundary_y; + } + else + { + fixed_tile_pixel_offset_ey = std::min(pixel_offset_boundary_y, pixel_offset_ey); + } } - } - uint8_t* tile_data = tile_raster; + uint8_t* tile_data = nullptr; + std::unique_ptr tile_raster = + std::unique_ptr(nullptr, cucim_free); - auto key = image_cache.create_key(ifd_hash_value, index); - image_cache.lock(index_hash); - auto value = image_cache.find(key); - if (value) - { - image_cache.unlock(index_hash); - tile_data = static_cast(value->data); - } - else - { - // Lifetime of tile_data is same with `value` - // : do not access this data when `value` is not accessible. 
- if (cache_type != cucim::cache::CacheType::kNoCache) + if (loader && loader->batch_data_processor()) { - tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); - } - - { - PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); switch (compression_method) { - case COMPRESSION_NONE: - cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - break; case COMPRESSION_JPEG: - cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - jpegtable_data, jpegtable_count, &tile_data, out_device, - jpeg_color_space); - break; - case COMPRESSION_ADOBE_DEFLATE: - case COMPRESSION_DEFLATE: - cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device); - break; - case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kSYCC); - break; - case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kRGB); - break; - case COMPRESSION_LZW: - cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - // Apply unpredictor - // 1: none, 2: horizontal differencing, 3: floating point predictor - // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf - if (predictor == 2) - { - cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); - } break; default: throw std::runtime_error("Unsupported compression method"); } - } - value = image_cache.create_value(tile_data, tile_raster_nbytes); - image_cache.insert(key, value); - image_cache.unlock(index_hash); - } - if (copy_partial) - { - uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; - // Fill original, then fill white for remaining - if (fill_gap_x > 0) - { - for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + auto value = loader->wait_for_processing(index); + if (!value) // if shutdown { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, - fixed_nbytes_tile_pixel_size_x); - memset(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, background_value, - fill_gap_x); + return; + } + + tile_data = static_cast(value->data); + + cudaError_t cuda_status; + if (copy_partial) + { + uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; + // Fill original, then fill white for remaining + if (fill_gap_x > 0) + { + CUDA_ERROR(cudaMemcpy2D( + dest_start_ptr + dest_pixel_index, dest_pixel_step_y, tile_data + nbytes_tile_index, + nbytes_tw, fixed_nbytes_tile_pixel_size_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1, cudaMemcpyDeviceToDevice)); + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, + dest_pixel_step_y, background_value, fill_gap_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1)); + dest_pixel_index += + dest_pixel_step_y * (fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1); + } + else + { + CUDA_ERROR(cudaMemcpy2D( + dest_start_ptr + dest_pixel_index, dest_pixel_step_y, tile_data + nbytes_tile_index, + nbytes_tw, 
fixed_nbytes_tile_pixel_size_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1, cudaMemcpyDeviceToDevice)); + dest_pixel_index += + dest_pixel_step_y * (fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1); + } + + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + background_value, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - (fixed_tile_pixel_offset_ey + 1) + 1)); + } + else + { + CUDA_ERROR(cudaMemcpy2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + tile_data + nbytes_tile_index, nbytes_tw, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1, + cudaMemcpyDeviceToDevice)); } } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + auto key = image_cache.create_key(ifd_hash_value, index); + image_cache.lock(index_hash); + auto value = image_cache.find(key); + if (value) { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, - fixed_nbytes_tile_pixel_size_x); + image_cache.unlock(index_hash); + tile_data = static_cast(value->data); } - } + else + { + // Lifetime of tile_data is same with `value` + // : do not access this data when `value` is not accessible. + if (cache_type != cucim::cache::CacheType::kNoCache) + { + tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); + } + else + { + // Allocate temporary buffer for tile data + tile_raster = std::unique_ptr( + reinterpret_cast(cucim_malloc(tile_raster_nbytes)), cucim_free); + tile_data = tile_raster.get(); + } + { + PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); + switch (compression_method) + { + case COMPRESSION_NONE: + cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case COMPRESSION_JPEG: + cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, + jpegtable_data, jpegtable_count, &tile_data, + out_device, jpeg_color_space); + break; + case COMPRESSION_ADOBE_DEFLATE: + case COMPRESSION_DEFLATE: + cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kSYCC); + break; + case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kRGB); + break; + case COMPRESSION_LZW: + cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + // Apply unpredictor + // 1: none, 2: horizontal differencing, 3: floating point predictor + // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf + if (predictor == 2) + { + cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); + } + break; + default: + throw std::runtime_error("Unsupported compression method"); + } + } + value = image_cache.create_value(tile_data, tile_raster_nbytes); + image_cache.insert(key, value); + image_cache.unlock(index_hash); + } + if (copy_partial) + { + uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; + // Fill original, then fill white for remaining + if (fill_gap_x > 
0) + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + fixed_nbytes_tile_pixel_size_x); + memset(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, + background_value, fill_gap_x); + } + } + else + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + fixed_nbytes_tile_pixel_size_x); + } + } - for (uint32_t ty = fixed_tile_pixel_offset_ey + 1; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y) - { - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + for (uint32_t ty = fixed_tile_pixel_offset_ey + 1; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y) + { + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + nbytes_tile_pixel_size_x); + } + } } } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + + if (out_device.type() == cucim::io::DeviceType::kCPU) { - memcpy( - dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, nbytes_tile_pixel_size_x); + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + // Set (255,255,255) + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + cudaError_t cuda_status; + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, background_value, + nbytes_tile_pixel_size_x, tile_pixel_offset_ey - tile_pixel_offset_sy)); } } + }; + + if (loader && *loader) + { + loader->enqueue(std::move(decode_func), + cucim::loader::TileInfo{ location_index, index, tiledata_offset, tiledata_size }); } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) - { - // Set (255,255,255) - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); - } + decode_func(); } + dest_pixel_index_x += nbytes_tile_pixel_size_x; } dest_start_ptr += dest_pixel_step_y * dest_pixel_offset_len_y; } - if (tile_raster) - { - cucim_free(tile_raster); - } return true; } @@ -966,4 +1239,4 @@ void IFD::write_offsets_(const char* file_path) offsets.close(); } -} // namespace cuslide::tiff \ No newline at end of file +} // namespace cuslide::tiff diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h index 16601f0d7..e15724782 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h @@ -19,13 +19,15 @@ #include "types.h" +#include +#include + +#include #include #include +#include //#include -#include -#include - namespace cuslide::tiff { @@ -40,21 +42,23 @@ class EXPORT_VISIBLE IFD : public 
std::enable_shared_from_this static bool read_region_tiles(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device); + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader); static bool read_region_tiles_boundary(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device); + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader); bool read(const TIFF* tiff, const cucim::io::format::ImageMetadataDesc* metadata, @@ -88,6 +92,9 @@ class EXPORT_VISIBLE IFD : public std::enable_shared_from_this const std::vector& image_piece_offsets() const; const std::vector& image_piece_bytecounts() const; + size_t pixel_size_nbytes() const; + size_t tile_raster_size_nbytes() const; + // Hidden methods for benchmarking void write_offsets_(const char* file_path); diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp index a99d1a411..cc3c5fd31 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp @@ -281,8 +281,8 @@ TIFF::TIFF(const cucim::filesystem::Path& file_path, int mode) : file_path_(file cucim_free(file_path_cstr); throw std::invalid_argument(fmt::format("Cannot load {}!", file_path)); } - // TODO: make file_handle_ object to pointer - file_handle_ = CuCIMFileHandle{ fd, nullptr, FileHandleType::kPosix, file_path_cstr, this }; + file_handle_shared_ = std::make_shared(fd, nullptr, FileHandleType::kPosix, file_path_cstr, this); + file_handle_ = file_handle_shared_.get(); // TODO: warning if the file is big endian is_big_endian_ = ::TIFFIsBigEndian(tiff_client_); @@ -318,17 +318,6 @@ void TIFF::close() TIFFClose(tiff_client_); tiff_client_ = nullptr; } - if (file_handle_.path) - { - cucim_free(file_handle_.path); - file_handle_.path = nullptr; - } - if (file_handle_.client_data) - { - // Deleting file_handle_.client_data is parser_close()'s responsibility - // Do not execute this: `delete static_cast(file_handle_.client_data);` - file_handle_.client_data = nullptr; - } if (metadata_) { delete reinterpret_cast(metadata_); @@ -698,8 +687,8 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, return read_associated_image(metadata, request, out_image_data, out_metadata); } - // TODO: assume length of location/size to 2. - constexpr int32_t ndims = 2; + const int32_t ndim = request->size_ndim; + const uint64_t location_len = request->location_len; if (request->level >= level_to_ifd_idx_.size()) { @@ -711,7 +700,7 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, auto original_img_width = main_ifd->width(); auto original_img_height = main_ifd->height(); - for (int32_t i = 0; i < ndims; ++i) + for (int32_t i = 0; i < ndim; ++i) { if (request->size[i] <= 0) { @@ -733,7 +722,7 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, float downsample_factor = metadata->resolution_info.level_downsamples[request->level]; // Change request based on downsample factor. 
(normalized value at level-0 -> real location at the requested level) - for (int32_t i = 0; i < ndims; ++i) + for (int64_t i = ndim * location_len - 1; i >= 0; --i) { request->location[i] /= downsample_factor; } @@ -831,8 +820,9 @@ bool TIFF::read_associated_image(const cucim::io::format::ImageMetadataDesc* met switch (compression_method) { case COMPRESSION_JPEG: - if (!cuslide::jpeg::decode_libjpeg(file_handle_.fd, nullptr /*jpeg_buf*/, offset, size, jpegtable_data, - jpegtable_count, &target_ptr, out_device, jpeg_color_space)) + if (!cuslide::jpeg::decode_libjpeg(file_handle_->fd, nullptr /*jpeg_buf*/, offset, size, + jpegtable_data, jpegtable_count, &target_ptr, out_device, + jpeg_color_space)) { cucim_free(raster); fmt::print(stderr, "[Error] Failed to read region with libjpeg!\n"); @@ -840,8 +830,8 @@ bool TIFF::read_associated_image(const cucim::io::format::ImageMetadataDesc* met } break; case COMPRESSION_LZW: - if (!cuslide::lzw::decode_lzw( - file_handle_.fd, nullptr /*jpeg_buf*/, offset, size, &target_ptr, strip_nbytes, out_device)) + if (!cuslide::lzw::decode_lzw(file_handle_->fd, nullptr /*jpeg_buf*/, offset, size, &target_ptr, + strip_nbytes, out_device)) { cucim_free(raster); fmt::print(stderr, "[Error] Failed to read region with lzw decoder!\n"); @@ -1041,9 +1031,9 @@ cucim::filesystem::Path TIFF::file_path() const return file_path_; } -CuCIMFileHandle TIFF::file_handle() const +std::shared_ptr& TIFF::file_handle() { - return file_handle_; + return file_handle_shared_; } ::TIFF* TIFF::client() const { diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h index 6b3730e88..c770669aa 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h @@ -68,7 +68,8 @@ class EXPORT_VISIBLE TIFF : public std::enable_shared_from_this cucim::io::format::ImageMetadataDesc* out_metadata); cucim::filesystem::Path file_path() const; - CuCIMFileHandle file_handle() const; + std::shared_ptr& file_handle(); /// used for moving the ownership of the file handle to the caller. + /// Do not use for the application -- it will return nullptr. 
::TIFF* client() const; const std::vector& ifd_offsets() const; std::shared_ptr ifd(size_t index) const; @@ -104,7 +105,9 @@ class EXPORT_VISIBLE TIFF : public std::enable_shared_from_this void _populate_aperio_svs_metadata(uint16_t ifd_count, void* metadata, std::shared_ptr& first_ifd); cucim::filesystem::Path file_path_; - CuCIMFileHandle file_handle_{}; + /// Temporary shared file handle whose ownership would be transferred to CuImage through parser_open() + std::shared_ptr file_handle_shared_; + CuCIMFileHandle* file_handle_ = nullptr; ::TIFF* tiff_client_ = nullptr; std::vector ifd_offsets_; /// IFD offset for an index (IFD index) std::vector> ifds_; /// IFD object for an index (IFD index) diff --git a/cpp/src/cache/image_cache.cpp b/cpp/src/cache/image_cache.cpp index 3cf4099ce..5036911b7 100644 --- a/cpp/src/cache/image_cache.cpp +++ b/cpp/src/cache/image_cache.cpp @@ -26,7 +26,8 @@ ImageCacheKey::ImageCacheKey(uint64_t file_hash, uint64_t index) : file_hash(fil { } -ImageCacheValue::ImageCacheValue(void* data, uint64_t size, void* user_obj) : data(data), size(size), user_obj(user_obj) +ImageCacheValue::ImageCacheValue(void* data, uint64_t size, void* user_obj, const cucim::io::DeviceType device_type) + : data(data), size(size), user_obj(user_obj), device_type(device_type) { } @@ -36,7 +37,8 @@ ImageCacheValue::operator bool() const } -ImageCache::ImageCache(const ImageCacheConfig& config, CacheType type) : type_(type), config_(config){}; +ImageCache::ImageCache(const ImageCacheConfig& config, CacheType type, const cucim::io::DeviceType device_type) + : type_(type), device_type_(device_type), config_(config){}; CacheType ImageCache::type() const { @@ -48,6 +50,11 @@ const char* ImageCache::type_str() const return "nocache"; } +cucim::io::DeviceType ImageCache::device_type() const +{ + return device_type_; +} + ImageCacheConfig& ImageCache::config() { return config_; diff --git a/cpp/src/cache/image_cache_empty.cpp b/cpp/src/cache/image_cache_empty.cpp index 0cd0a0c5e..c5935eb62 100644 --- a/cpp/src/cache/image_cache_empty.cpp +++ b/cpp/src/cache/image_cache_empty.cpp @@ -27,7 +27,7 @@ std::shared_ptr EmptyImageCache::create_key(uint64_t, uint64_t) { return std::make_shared(0, 0); } -std::shared_ptr EmptyImageCache::create_value(void*, uint64_t) +std::shared_ptr EmptyImageCache::create_value(void*, uint64_t, const cucim::io::DeviceType) { return std::make_shared(nullptr, 0); } @@ -47,11 +47,19 @@ void EmptyImageCache::unlock(uint64_t) return; } +void* EmptyImageCache::mutex(uint64_t) +{ + return nullptr; +} + bool EmptyImageCache::insert(std::shared_ptr&, std::shared_ptr&) { return true; } +void EmptyImageCache::remove_front() +{ +} uint32_t EmptyImageCache::size() const { diff --git a/cpp/src/cache/image_cache_empty.h b/cpp/src/cache/image_cache_empty.h index b592c817a..5ad367d9b 100644 --- a/cpp/src/cache/image_cache_empty.h +++ b/cpp/src/cache/image_cache_empty.h @@ -35,13 +35,16 @@ class EmptyImageCache : public ImageCache EmptyImageCache(const ImageCacheConfig& config); std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void 
remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; diff --git a/cpp/src/cache/image_cache_manager.cpp b/cpp/src/cache/image_cache_manager.cpp index bd8a8a7e9..ce67b5f5d 100644 --- a/cpp/src/cache/image_cache_manager.cpp +++ b/cpp/src/cache/image_cache_manager.cpp @@ -29,7 +29,7 @@ namespace cucim::cache { -uint32_t preferred_memory_capacity(const std::vector& image_size, +uint32_t preferred_memory_capacity(const std::vector& image_size, const std::vector& tile_size, const std::vector& patch_size, uint32_t bytes_per_pixel) @@ -47,7 +47,9 @@ uint32_t preferred_memory_capacity(const std::vector& image_size, // The maximal number of tiles (y-axis) overapped with the given patch uint32_t patch_down_count = - std::min(image_size[1] + (tile_size[1] - 1), patch_size[1] + (tile_size[1] - 1)) / tile_size[1] + 1; + std::min(image_size[1] + (tile_size[1] - 1), static_cast(patch_size[1] + (tile_size[1] - 1))) / + tile_size[1] + + 1; // (tile_accross_count) x (tile width) x (tile_height) x (patch_down_count) x (bytes per pixel) uint64_t bytes_needed = @@ -95,14 +97,8 @@ void ImageCacheManager::reserve(uint32_t new_memory_capacity, uint32_t new_capac cache_->reserve(cache_config); } -std::unique_ptr ImageCacheManager::create_cache() const -{ - ImageCacheConfig& cache_config = cucim::CuImage::get_config()->cache(); - - return create_cache(cache_config); -} - -std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConfig& cache_config) const +std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConfig& cache_config, + const cucim::io::DeviceType device_type) { PROF_SCOPED_RANGE(PROF_EVENT(image_cache_create_cache)); switch (cache_config.type) @@ -110,12 +106,19 @@ std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConf case CacheType::kNoCache: return std::make_unique(cache_config); case CacheType::kPerProcess: - return std::make_unique(cache_config); + return std::make_unique(cache_config, device_type); case CacheType::kSharedMemory: - return std::make_unique(cache_config); + return std::make_unique(cache_config, device_type); default: return std::make_unique(cache_config); } } +std::unique_ptr ImageCacheManager::create_cache() const +{ + ImageCacheConfig& cache_config = cucim::CuImage::get_config()->cache(); + + return create_cache(cache_config); +} + } // namespace cucim::cache diff --git a/cpp/src/cache/image_cache_per_process.cpp b/cpp/src/cache/image_cache_per_process.cpp index f2b36889f..95d2286c5 100644 --- a/cpp/src/cache/image_cache_per_process.cpp +++ b/cpp/src/cache/image_cache_per_process.cpp @@ -17,7 +17,9 @@ #include "image_cache_per_process.h" +#include "cucim/cache/image_cache.h" #include "cucim/memory/memory_manager.h" +#include "cucim/util/cuda.h" #include @@ -53,19 +55,38 @@ struct PerProcessImageCacheItem std::shared_ptr value; }; -PerProcessImageCacheValue::PerProcessImageCacheValue(void* data, uint64_t size, void* user_obj) - : ImageCacheValue(data, size, user_obj){}; +PerProcessImageCacheValue::PerProcessImageCacheValue(void* data, + uint64_t size, + void* user_obj, + const cucim::io::DeviceType device_type) + : ImageCacheValue(data, size, user_obj, device_type){}; + PerProcessImageCacheValue::~PerProcessImageCacheValue() { if (data) { - cucim_free(data); + switch (device_type) + { + case io::DeviceType::kCPU: + cucim_free(data); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + CUDA_TRY(cudaFree(data)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + 
case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } data = nullptr; } }; -PerProcessImageCache::PerProcessImageCache(const ImageCacheConfig& config) - : ImageCache(config, CacheType::kPerProcess), +PerProcessImageCache::PerProcessImageCache(const ImageCacheConfig& config, const cucim::io::DeviceType device_type) + : ImageCache(config, CacheType::kPerProcess, device_type), mutex_array_(config.mutex_pool_capacity), capacity_nbytes_(kOneMiB * config.memory_capacity), capacity_(config.capacity), @@ -89,14 +110,32 @@ std::shared_ptr PerProcessImageCache::create_key(uint64_t file_ha { return std::make_shared(file_hash, index); } -std::shared_ptr PerProcessImageCache::create_value(void* data, uint64_t size) +std::shared_ptr PerProcessImageCache::create_value(void* data, + uint64_t size, + const cucim::io::DeviceType device_type) { - return std::make_shared(data, size); + return std::make_shared(data, size, nullptr, device_type); } void* PerProcessImageCache::allocate(std::size_t n) { - return cucim_malloc(n); + switch (device_type_) + { + case io::DeviceType::kCPU: + return cucim_malloc(n); + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + void* image_data_ptr = nullptr; + CUDA_TRY(cudaMalloc(&image_data_ptr, n)); + return image_data_ptr; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type_); + break; + } + return nullptr; } void PerProcessImageCache::lock(uint64_t index) @@ -109,6 +148,11 @@ void PerProcessImageCache::unlock(uint64_t index) mutex_array_[index % mutex_pool_capacity_].unlock(); } +void* PerProcessImageCache::mutex(uint64_t index) +{ + return &mutex_array_[index % mutex_pool_capacity_]; +} + bool PerProcessImageCache::insert(std::shared_ptr& key, std::shared_ptr& value) { if (value->size > capacity_nbytes_ || capacity_ < 1) @@ -135,6 +179,32 @@ bool PerProcessImageCache::insert(std::shared_ptr& key, std::shar return succeed; } +void PerProcessImageCache::remove_front() +{ + while (true) + { + uint32_t head = list_head_.load(std::memory_order_relaxed); + uint32_t tail = list_tail_.load(std::memory_order_relaxed); + if (head != tail) + { + // Remove front by increasing head + if (list_head_.compare_exchange_weak( + head, (head + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) + { + std::shared_ptr head_item = list_[head]; + size_nbytes_.fetch_sub(head_item->value->size, std::memory_order_relaxed); + hashmap_.erase(head_item->key); + list_[head].reset(); // decrease refcount + break; + } + } + else + { + break; // already empty + } + } +} + uint32_t PerProcessImageCache::size() const { uint32_t head = list_head_.load(std::memory_order_relaxed); @@ -279,36 +349,6 @@ bool PerProcessImageCache::is_memory_full(uint64_t additional_size) const } } -void PerProcessImageCache::remove_front() -{ - while (true) - { - uint32_t head = list_head_.load(std::memory_order_relaxed); - uint32_t tail = list_tail_.load(std::memory_order_relaxed); - if (head != tail) - { - // Remove front by increasing head - if (list_head_.compare_exchange_weak( - head, (head + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) - { - // fmt::print(stderr, "{} remove list_[{:05}]\n", std::hash{}(std::this_thread::get_id()), head); //[print_list] - std::shared_ptr head_item = list_[head]; - // if (head_item) // it is possible that head_item is nullptr. 
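The remove_front() added above evicts the oldest cache entry by advancing the head index of a fixed-capacity ring with compare_exchange_weak instead of holding a lock; only the thread whose CAS succeeds releases the slot and subtracts its byte count. A minimal standalone sketch of the same pattern (Ring and Slot are illustrative names, not cuCIM types):

```cpp
#include <atomic>
#include <cstdint>
#include <vector>

struct Slot { uint64_t size = 0; };

class Ring {
public:
    explicit Ring(uint32_t capacity) : slots_(capacity), capacity_(capacity) {}

    // Claim the front slot by advancing head; only the thread whose CAS
    // succeeds releases the item, so no mutex is required.
    bool remove_front() {
        while (true) {
            uint32_t head = head_.load(std::memory_order_relaxed);
            uint32_t tail = tail_.load(std::memory_order_relaxed);
            if (head == tail) {
                return false; // already empty
            }
            if (head_.compare_exchange_weak(head, (head + 1) % capacity_,
                                            std::memory_order_release,
                                            std::memory_order_relaxed)) {
                bytes_.fetch_sub(slots_[head].size, std::memory_order_relaxed);
                slots_[head] = Slot{}; // drop the reference
                return true;
            }
            // CAS failed: another thread won the race; retry with a fresh head.
        }
    }

private:
    std::vector<Slot> slots_;
    uint32_t capacity_;
    std::atomic<uint32_t> head_{0};
    std::atomic<uint32_t> tail_{0};
    std::atomic<uint64_t> bytes_{0};
};
```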
- // { - size_nbytes_.fetch_sub(head_item->value->size, std::memory_order_relaxed); - hashmap_.erase(head_item->key); - list_[head].reset(); // decrease refcount - break; - // } - } - } - else - { - break; // already empty - } - } -} - void PerProcessImageCache::push_back(std::shared_ptr& item) { uint32_t tail = list_tail_.load(std::memory_order_relaxed); @@ -318,7 +358,6 @@ void PerProcessImageCache::push_back(std::shared_ptr& if (list_tail_.compare_exchange_weak( tail, (tail + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) { - // fmt::print(stderr, "{} list_[{:05}]={}\n", std::hash{}(std::this_thread::get_id()), tail, (uint64_t)item->key->location_hash); // [print_list] list_[tail] = item; size_nbytes_.fetch_add(item->value->size, std::memory_order_relaxed); break; diff --git a/cpp/src/cache/image_cache_per_process.h b/cpp/src/cache/image_cache_per_process.h index 67c325fd0..5279e44e6 100644 --- a/cpp/src/cache/image_cache_per_process.h +++ b/cpp/src/cache/image_cache_per_process.h @@ -49,7 +49,10 @@ struct PerProcessImageCacheItem; struct PerProcessImageCacheValue : public ImageCacheValue { - PerProcessImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + PerProcessImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~PerProcessImageCacheValue() override; }; @@ -64,19 +67,23 @@ struct PerProcessImageCacheValue : public ImageCacheValue class PerProcessImageCache : public ImageCache { public: - PerProcessImageCache(const ImageCacheConfig& config); + PerProcessImageCache(const ImageCacheConfig& config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~PerProcessImageCache(); const char* type_str() const override; std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; @@ -98,7 +105,6 @@ class PerProcessImageCache : public ImageCache private: bool is_list_full() const; bool is_memory_full(uint64_t additional_size = 0) const; - void remove_front(); void push_back(std::shared_ptr& item); bool erase(const std::shared_ptr& key); diff --git a/cpp/src/cache/image_cache_shared_memory.cpp b/cpp/src/cache/image_cache_shared_memory.cpp index d09e8d47c..8b62cac68 100644 --- a/cpp/src/cache/image_cache_shared_memory.cpp +++ b/cpp/src/cache/image_cache_shared_memory.cpp @@ -154,8 +154,11 @@ struct ImageCacheItemDetail deleter_type value; }; -SharedMemoryImageCacheValue::SharedMemoryImageCacheValue(void* data, uint64_t size, void* user_obj) - : ImageCacheValue(data, size, user_obj){}; +SharedMemoryImageCacheValue::SharedMemoryImageCacheValue(void* data, + uint64_t size, + void* user_obj, + const cucim::io::DeviceType device_type) + : ImageCacheValue(data, size, user_obj, device_type){}; SharedMemoryImageCacheValue::~SharedMemoryImageCacheValue() { @@ -169,8 +172,8 @@ SharedMemoryImageCacheValue::~SharedMemoryImageCacheValue() } }; -SharedMemoryImageCache::SharedMemoryImageCache(const 
ImageCacheConfig& config) - : ImageCache(config, CacheType::kSharedMemory), +SharedMemoryImageCache::SharedMemoryImageCache(const ImageCacheConfig& config, const cucim::io::DeviceType device_type) + : ImageCache(config, CacheType::kSharedMemory, device_type), segment_(create_segment(config)), // mutex_array_(nullptr, shared_mem_deleter(segment_)), size_nbytes_(nullptr, shared_mem_deleter>(segment_)), @@ -190,6 +193,12 @@ SharedMemoryImageCache::SharedMemoryImageCache(const ImageCacheConfig& config) const uint32_t& mutex_pool_capacity = config.mutex_pool_capacity; const bool& record_stat = config.record_stat; + if (device_type != cucim::io::DeviceType::kCPU) + { + throw std::runtime_error( + fmt::format("[Error] SharedMemoryImageCache doesn't support other memory type other than CPU memory!\n")); + } + try { // mutex_array_.reset(segment_->find_or_construct_it( @@ -282,11 +291,13 @@ std::shared_ptr SharedMemoryImageCache::create_key(uint64_t file_ return std::shared_ptr(key.get().get(), null_deleter(key)); } -std::shared_ptr SharedMemoryImageCache::create_value(void* data, uint64_t size) +std::shared_ptr SharedMemoryImageCache::create_value(void* data, + uint64_t size, + const cucim::io::DeviceType device_type) { auto value = boost::interprocess::make_managed_shared_ptr( segment_->find_or_construct(boost::interprocess::anonymous_instance)( - data, size, &*segment_), + data, size, &*segment_, device_type), *segment_); return std::shared_ptr(value.get().get(), null_deleter(value)); @@ -333,6 +344,11 @@ void SharedMemoryImageCache::unlock(uint64_t index) mutex_array_[index % *mutex_pool_capacity_].unlock(); } +void* SharedMemoryImageCache::mutex(uint64_t index) +{ + return &mutex_array_[index % *mutex_pool_capacity_]; +} + bool SharedMemoryImageCache::insert(std::shared_ptr& key, std::shared_ptr& value) { if (value->size > *capacity_nbytes_ || *capacity_ < 1) @@ -359,6 +375,36 @@ bool SharedMemoryImageCache::insert(std::shared_ptr& key, std::sh return succeed; } +void SharedMemoryImageCache::remove_front() +{ + while (true) + { + uint32_t head = (*list_head_).load(std::memory_order_relaxed); + uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); + if (head != tail) + { + // Remove front by increasing head + if ((*list_head_) + .compare_exchange_weak( + head, (head + 1) % (*list_capacity_), std::memory_order_release, std::memory_order_relaxed)) + { + auto& head_item = (*list_)[head]; + if (head_item) // it is possible that head_item is nullptr + { + (*size_nbytes_).fetch_sub(head_item->value->size, std::memory_order_relaxed); + hashmap_->erase(head_item->key); + (*list_)[head].reset(); // decrease refcount + break; + } + } + } + else + { + break; // already empty + } + } +} + uint32_t SharedMemoryImageCache::size() const { uint32_t head = list_head_->load(std::memory_order_relaxed); @@ -508,36 +554,6 @@ bool SharedMemoryImageCache::is_memory_full(uint64_t additional_size) const } } -void SharedMemoryImageCache::remove_front() -{ - while (true) - { - uint32_t head = (*list_head_).load(std::memory_order_relaxed); - uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); - if (head != tail) - { - // Remove front by increasing head - if ((*list_head_) - .compare_exchange_weak( - head, (head + 1) % (*list_capacity_), std::memory_order_release, std::memory_order_relaxed)) - { - auto& head_item = (*list_)[head]; - if (head_item) // it is possible that head_item is nullptr - { - (*size_nbytes_).fetch_sub(head_item->value->size, std::memory_order_relaxed); - 
hashmap_->erase(head_item->key); - (*list_)[head].reset(); // decrease refcount - break; - } - } - } - else - { - break; // already empty - } - } -} - void SharedMemoryImageCache::push_back(cache_item_type& item) { uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); diff --git a/cpp/src/cache/image_cache_shared_memory.h b/cpp/src/cache/image_cache_shared_memory.h index dcf5b560c..3f8134977 100644 --- a/cpp/src/cache/image_cache_shared_memory.h +++ b/cpp/src/cache/image_cache_shared_memory.h @@ -40,7 +40,10 @@ struct ImageCacheItemDetail; struct SharedMemoryImageCacheValue : public ImageCacheValue { - SharedMemoryImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + SharedMemoryImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~SharedMemoryImageCacheValue() override; }; @@ -121,19 +124,23 @@ using cache_item_type = boost::interprocess::shared_ptr< class SharedMemoryImageCache : public ImageCache { public: - SharedMemoryImageCache(const ImageCacheConfig& config); + SharedMemoryImageCache(const ImageCacheConfig& config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~SharedMemoryImageCache(); const char* type_str() const override; std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; @@ -155,7 +162,6 @@ class SharedMemoryImageCache : public ImageCache private: bool is_list_full() const; bool is_memory_full(uint64_t additional_size = 0) const; - void remove_front(); void push_back(cache_item_type& item); bool erase(const std::shared_ptr& key); diff --git a/cpp/src/concurrent/threadpool.cpp b/cpp/src/concurrent/threadpool.cpp new file mode 100644 index 000000000..595b40aff --- /dev/null +++ b/cpp/src/concurrent/threadpool.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cucim/concurrent/threadpool.h" + +#include <future> +#include <taskflow/taskflow.hpp> + +#include "cucim/profiler/nvtx3.h" + +namespace cucim::concurrent +{ + +struct ThreadPool::Executor : public tf::Executor +{ + // Inherits constructors + using tf::Executor::Executor; +}; + + +ThreadPool::ThreadPool(int32_t num_workers) +{ + num_workers_ = num_workers; + if (num_workers > 0) + { + executor_ = std::make_unique<Executor>(num_workers); + } +} + +ThreadPool::~ThreadPool() +{ + if (executor_) + { + executor_->wait_for_all(); + } +} + +ThreadPool::operator bool() const +{ + return (num_workers_ > 0); +} + +std::future<void> ThreadPool::enqueue(std::function<void()> task) +{ + auto future = executor_->async([task]() { task(); }); + return std::move(future); +} + +void ThreadPool::wait() +{ + if (executor_) + { + executor_->wait_for_all(); + } +} + +} // namespace cucim::concurrent diff --git a/cpp/src/cuimage.cpp b/cpp/src/cuimage.cpp index b41b5be64..e4b03d0fb 100644 --- a/cpp/src/cuimage.cpp +++ b/cpp/src/cuimage.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #if CUCIM_SUPPORT_CUDA # include #endif @@ -143,21 +144,25 @@ std::unique_ptr CuImage::image_format_plugins_ = std::make_ CuImage::CuImage(const filesystem::Path& path) { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 1)); - // printf("[cuCIM] CuImage::CuImage(filesystem::Path path)\n"); + ensure_init(); image_format_ = image_format_plugins_->detect_image_format(path); // TODO: need to detect available format for the file path { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_cuimage_open)); - file_handle_ = image_format_->image_parser.open(path.c_str()); + std::shared_ptr<CuCIMFileHandle>* file_handle_shared = + reinterpret_cast<std::shared_ptr<CuCIMFileHandle>*>(image_format_->image_parser.open(path.c_str())); + file_handle_ = *file_handle_shared; + delete file_handle_shared; + + // Set deleter to close the file handle + file_handle_->set_deleter(image_format_->image_parser.close); } - // printf("[GB] file_handle: %s\n", file_handle_.path); - // fmt::print("[GB] CuImage path char: '{}'\n", file_handle_.path[0]); io::format::ImageMetadata& image_metadata = *(new io::format::ImageMetadata{}); image_metadata_ = &image_metadata.desc(); - is_loaded_ = image_format_->image_parser.parse(&file_handle_, image_metadata_); + is_loaded_ = image_format_->image_parser.parse(file_handle_.get(), image_metadata_); dim_indices_ = DimIndices(image_metadata_->dims); auto& associated_image_info = image_metadata_->associated_image_info; @@ -178,18 +183,10 @@ CuImage::CuImage(const filesystem::Path& path, const std::string& plugin_name) (void)plugin_name; } -// CuImage::CuImage(const CuImage& cuimg) : std::enable_shared_from_this<CuImage>() //{ // printf("[cuCIM] CuImage::CuImage(const CuImage& cuimg)\n"); // (void)cuimg; // //} - CuImage::CuImage(CuImage&& cuimg) : std::enable_shared_from_this<CuImage>() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 3)); - // printf("[cuCIM] CuImage::CuImage(CuImage&& cuimg) %s\n", cuimg.file_handle_.path); - (void)cuimg; + std::swap(file_handle_, cuimg.file_handle_); std::swap(image_format_, cuimg.image_format_); std::swap(image_metadata_, cuimg.image_metadata_); @@ -205,11 +202,8 @@ CuImage::CuImage(const CuImage* cuimg, : std::enable_shared_from_this<CuImage>() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 4)); - // printf( - // "[cuCIM] CuImage::CuImage(CuImage* cuimg, io::format::ImageMetadataDesc* image_metadata, - // cucim::io::format::ImageDataDesc* image_data)\n"); - // file_handle_ = cuimg->file_handle_; ==> Don't do this. it will cause a double free. 
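The constructor change above is the ownership hand-off described in the commit message: the plugin's C-style parser open() cannot return a std::shared_ptr by value, so it returns a heap-allocated shared_ptr as an opaque pointer that the caller dereferences and immediately deletes. A self-contained sketch of the pattern (Handle and plugin_open are hypothetical names, not the real plugin API):

```cpp
#include <cstdio>
#include <memory>

struct Handle { int fd = 42; };

// Plugin side: a C entry point cannot return std::shared_ptr<Handle> by
// value, so it returns a heap-allocated shared_ptr as an opaque pointer.
extern "C" void* plugin_open(const char* /*path*/) {
    return new std::shared_ptr<Handle>(std::make_shared<Handle>());
}

int main() {
    // Caller side: copy the reference out, then free only the wrapper.
    auto* wrapper = static_cast<std::shared_ptr<Handle>*>(plugin_open("a.tif"));
    std::shared_ptr<Handle> handle = *wrapper; // refcount +1
    delete wrapper;                            // refcount -1; handle keeps it alive

    std::printf("fd=%d use_count=%ld\n", handle->fd, handle.use_count());
} // last shared_ptr goes out of scope here; Handle is destroyed exactly once
```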
+ file_handle_ = cuimg->file_handle_; image_format_ = cuimg->image_format_; image_metadata_ = image_metadata; image_data_ = image_data; @@ -233,15 +227,14 @@ CuImage::CuImage(const CuImage* cuimg, CuImage::CuImage() : std::enable_shared_from_this() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 5)); - file_handle_.path = const_cast(""); + file_handle_ = std::make_shared(); + file_handle_->path = const_cast(""); } CuImage::~CuImage() { PROF_SCOPED_RANGE(PROF_EVENT(cuimage__cuimage)); - // printf("[cuCIM] CuImage::~CuImage()\n"); - close(); - image_format_ = nullptr; // memory release is handled by the framework + if (image_metadata_) { // Memory for json_data needs to be manually released if image_metadata_->json_data is not "" @@ -273,18 +266,18 @@ CuImage::~CuImage() image_data_->container.data = nullptr; break; case io::DeviceType::kCUDA: - cudaError_t cuda_status; - CUDA_TRY(cudaFree(image_data_->container.data)); - image_data_->container.data = nullptr; - if (cuda_status) + + if (image_data_->loader) { - fmt::print(stderr, "[Error] Cannot free memory!"); + cudaError_t cuda_status; + CUDA_TRY(cudaFree(image_data_->container.data)); } + image_data_->container.data = nullptr; break; case io::DeviceType::kPinned: case io::DeviceType::kCPUShared: case io::DeviceType::kCUDAShared: - fmt::print(stderr, "Device type {} is not supported!", device_type); + fmt::print(stderr, "Device type {} is not supported!\n", device_type); break; } } @@ -303,9 +296,20 @@ CuImage::~CuImage() cucim_free(image_data_->shm_name); image_data_->shm_name = nullptr; } + if (image_data_->loader) + { + auto loader = reinterpret_cast(image_data_->loader); + delete loader; + + image_data_->loader = nullptr; + } + cucim_free(image_data_); image_data_ = nullptr; } + + close(); // close file handle (NOTE:: close the file handle after loader is deleted) + image_format_ = nullptr; // memory release is handled by the framework } Framework* CuImage::get_framework() @@ -351,7 +355,7 @@ bool CuImage::is_trace_enabled() filesystem::Path CuImage::path() const { - return file_handle_.path == nullptr ? "" : file_handle_.path; + return file_handle_->path == nullptr ? "" : file_handle_->path; } bool CuImage::is_loaded() const { @@ -588,9 +592,27 @@ memory::DLTContainer CuImage::container() const } } +loader::ThreadBatchDataLoader* CuImage::loader() const +{ + if (image_data_) + { + return reinterpret_cast(image_data_->loader); + } + else + { + return nullptr; + } +} + CuImage CuImage::read_region(std::vector&& location, std::vector&& size, uint16_t level, + uint32_t num_workers, + uint32_t batch_size, + bool drop_last, + uint32_t prefetch_factor, + bool shuffle, + uint64_t seed, const DimIndices& region_dim_indices, const io::Device& device, DLTensor* buf, @@ -598,7 +620,6 @@ CuImage CuImage::read_region(std::vector&& location, { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_read_region)); (void)region_dim_indices; - (void)device; (void)buf; (void)shm_name; @@ -621,44 +642,101 @@ CuImage CuImage::read_region(std::vector&& location, size.insert(size.end(), level_dimension.begin(), level_dimension.end()); } + // The number of locations should be the multiplication of the number of dimensions in the size. + if (location.size() % size.size() != 0) + { + throw std::runtime_error( + "[Error] The number of locations should be the multiplication of the number of dimensions in the size!"); + } + + // Make sure the batch size is not zero. 
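With the extended signature above, a single read_region() call can fetch many patches: location is a flattened list of coordinate pairs, and the new loader arguments control batching and prefetch. A hedged usage sketch (assuming the trailing parameters keep their declared defaults as in the examples later in this patch; file name and coordinates are illustrative):

```cpp
#include <memory>

#include "cucim/cuimage.h"

// Sketch: four 256x256 patches at level 0, decoded by two workers in
// batches of two. 'location' is flattened, one coordinate pair per patch,
// so location.size() must be a multiple of size.size() -- the check added above.
void read_patches_sketch()
{
    cucim::CuImage image("input.tiff");

    auto batches = std::make_shared<cucim::CuImage>(image.read_region(
        { 0, 0, 256, 0, 0, 256, 256, 256 }, // location: 4 patches
        { 256, 256 },                       // size: one patch extent
        0,                                  // level
        2,                                  // num_workers: thread pool size
        2,                                  // batch_size > 1 -> 'NYXC' output
        false,                              // drop_last
        2,                                  // prefetch_factor
        false,                              // shuffle
        0));                                // seed

    // With batch_size > 1 the result is iterable; begin()/end() call
    // shared_from_this(), hence the shared_ptr wrapper here.
    for (auto it = batches->begin(); it != batches->end(); ++it)
    {
        // Each step advances the loader; the current batch tensor is
        // exposed through the usual container() accessor.
    }
}
```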
+ if (batch_size == 0) + { + batch_size = 1; + } + + // num_workers would be always > 0 if output device type is CUDA + if (num_workers == 0 && device.type() == cucim::io::DeviceType::kCUDA) + { + num_workers = 1; + } + + uint32_t size_ndim = size.size(); + uint64_t location_len = location.size() / size_ndim; std::string device_name = std::string(device); cucim::io::format::ImageReaderRegionRequestDesc request{}; - int64_t request_location[2] = { location[0], location[1] }; - request.location = request_location; + + if (location_len > 1 || batch_size > 1 || num_workers > 0) + { + // ::Note:: Here, to pass vector data to C interface, we move data in the original vector to the vector in heap + // memory and create a unique pointer with 'new'. The data is transferred to ThreadBatchDataLoader class members + // (locations_ and size_) for automatic deletion on exit. + auto location_ptr = new std::vector(); + location_ptr->swap(location); + auto location_unique = reinterpret_cast(new std::unique_ptr>(location_ptr)); + + auto size_ptr = new std::vector(); + size_ptr->swap(size); + auto size_unique = reinterpret_cast(new std::unique_ptr>(size_ptr)); + + request.location = location_ptr->data(); + request.location_unique = location_unique; + request.size = size_ptr->data(); + request.size_unique = size_unique; + } + else + { + request.location = location.data(); + request.size = size.data(); + } + request.location_len = location_len; + request.size_ndim = size_ndim; request.level = level; - int64_t request_size[2] = { size[0], size[1] }; - request.size = request_size; + request.num_workers = num_workers; + request.batch_size = batch_size; + request.drop_last = drop_last; + request.prefetch_factor = prefetch_factor; + request.shuffle = shuffle; + request.seed = seed; request.device = device_name.data(); - // cucim::io::format::ImageDataDesc image_data{}; + auto image_data = std::unique_ptr( + reinterpret_cast(cucim_malloc(sizeof(io::format::ImageDataDesc))), cucim_free); + memset(image_data.get(), 0, sizeof(io::format::ImageDataDesc)); - cucim::io::format::ImageDataDesc* image_data = - static_cast(cucim_malloc(sizeof(cucim::io::format::ImageDataDesc))); - memset(image_data, 0, sizeof(cucim::io::format::ImageDataDesc)); try { // Read region from internal file if image_data_ is nullptr if (image_data_ == nullptr) { - if (file_handle_.fd < 0) // file_handle_ is not opened + if (!file_handle_) // file_handle_ is not opened { throw std::runtime_error("[Error] The image file is closed!"); } if (!image_format_->image_reader.read( - &file_handle_, image_metadata_, &request, image_data, nullptr /*out_metadata*/)) + file_handle_.get(), image_metadata_, &request, image_data.get(), nullptr /*out_metadata*/)) { - cucim_free(image_data); throw std::runtime_error("[Error] Failed to read image!"); } } else // Read region by cropping image { + const char* dims_str = image_metadata_->dims; + if (strncmp("YXC", dims_str, 4) != 0) + { + throw std::runtime_error(fmt::format("[Error] The image is not in YXC format! ({})", dims_str)); + } + if (image_data_->container.data == nullptr) + { + throw std::runtime_error( + "[Error] The image data is nullptr! It is possible that the object is iterator and the image data " + "is not loaded yet! 
Please advance the iterator first!"); + } crop_image(request, *image_data); } } catch (std::invalid_argument& e) { - cucim_free(image_data); throw e; } @@ -676,6 +754,10 @@ CuImage CuImage::read_region(std::vector&& location, auto& resource = out_metadata.get_resource(); std::string_view dims{ "YXC" }; + if (batch_size > 1) + { + dims = { "NYXC" }; + } // Information from image_data std::pmr::vector shape(&resource); @@ -712,9 +794,17 @@ CuImage CuImage::read_region(std::vector&& location, std::pmr::vector spacing_units(&resource); spacing_units.reserve(ndim); - for (int i = 0; i < ndim; i++) + + int index = 0; + if (ndim == 4) { - int64_t dim_char = dim_indices_.index(dims[i]); + index = 1; + // The first dimension is for 'batch' ('N') + spacing_units.emplace_back(std::string_view{ "batch" }); + } + for (; index < ndim; ++index) + { + int64_t dim_char = dim_indices_.index(dims[index]); const char* str_ptr = image_metadata_->spacing_units[dim_char]; size_t str_len = strlen(image_metadata_->spacing_units[dim_char]); @@ -758,7 +848,7 @@ CuImage CuImage::read_region(std::vector&& location, const uint16_t level_ndim = 2; std::pmr::vector level_dimensions(&resource); level_dimensions.reserve(level_ndim * 1); // it has only one size - level_dimensions.insert(level_dimensions.end(), &size[0], &size[level_ndim]); + level_dimensions.insert(level_dimensions.end(), request.location, &request.location[request.location_len]); std::pmr::vector level_downsamples(&resource); level_downsamples.reserve(1); @@ -766,7 +856,8 @@ CuImage CuImage::read_region(std::vector&& location, std::pmr::vector level_tile_sizes(&resource); level_tile_sizes.reserve(level_ndim * 1); // it has only one size - level_tile_sizes.insert(level_tile_sizes.end(), &size[0], &size[level_ndim]); // same with level_dimension + level_tile_sizes.insert( + level_tile_sizes.end(), request.location, &request.location[request.location_len]); // same with level_dimension // Empty associated images const size_t associated_image_count = 0; @@ -797,7 +888,7 @@ CuImage CuImage::read_region(std::vector&& location, out_metadata.raw_data(raw_data); out_metadata.json_data(json_data); - return CuImage(this, &out_metadata.desc(), image_data); + return CuImage(this, &out_metadata.desc(), image_data.release()); } std::set CuImage::associated_images() const @@ -808,7 +899,7 @@ std::set CuImage::associated_images() const CuImage CuImage::associated_image(const std::string& name, const io::Device& device) const { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_associated_image)); - if (file_handle_.fd < 0) // file_handle_ is not opened + if (file_handle_->fd < 0) // file_handle_ is not opened { throw std::runtime_error("[Error] The image file is closed!"); } @@ -820,20 +911,19 @@ CuImage CuImage::associated_image(const std::string& name, const io::Device& dev std::string device_name = std::string(device); request.device = device_name.data(); - io::format::ImageDataDesc* out_image_data = - static_cast(cucim_malloc(sizeof(cucim::io::format::ImageDataDesc))); + auto out_image_data = std::unique_ptr( + reinterpret_cast(cucim_malloc(sizeof(io::format::ImageDataDesc))), cucim_free); + memset(out_image_data.get(), 0, sizeof(io::format::ImageDataDesc)); io::format::ImageMetadata& out_metadata = *(new io::format::ImageMetadata{}); if (!image_format_->image_reader.read( - &file_handle_, image_metadata_, &request, out_image_data, &out_metadata.desc())) + file_handle_.get(), image_metadata_, &request, out_image_data.get(), &out_metadata.desc())) { - cucim_free(out_image_data); - 
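Both code paths above now wrap the cucim_malloc'ed ImageDataDesc in a std::unique_ptr whose deleter is cucim_free, so the early throw statements no longer leak the descriptor, and release() hands ownership back at the single success exit. A generic sketch of the idiom (PatchDesc stands in for the real descriptor type):

```cpp
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>

struct PatchDesc { int width; int height; }; // stand-in for ImageDataDesc

using PatchDescPtr = std::unique_ptr<PatchDesc, decltype(&std::free)>;

PatchDescPtr make_patch_desc()
{
    // C-style allocation owned by a smart pointer: freed automatically
    // on every exit path, including exceptions thrown below.
    auto desc = PatchDescPtr(static_cast<PatchDesc*>(std::malloc(sizeof(PatchDesc))), std::free);
    if (!desc)
        throw std::runtime_error("allocation failed"); // no manual free needed

    std::memset(desc.get(), 0, sizeof(PatchDesc));
    return desc; // or desc.release() to hand ownership to a C interface
}
```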
delete &out_metadata; throw std::runtime_error("[Error] Failed to read image!"); } - return CuImage(this, &out_metadata.desc(), out_image_data); + return CuImage(this, &out_metadata.desc(), out_image_data.release()); } return CuImage{}; } @@ -892,13 +982,7 @@ void CuImage::save(std::string file_path) const void CuImage::close() { - if (file_handle_.client_data) - { - image_format_->image_parser.close(&file_handle_); - } - file_handle_.cufile = nullptr; - file_handle_.path = nullptr; - file_handle_.fd = -1; + file_handle_ = nullptr; } void CuImage::ensure_init() @@ -947,8 +1031,7 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request io::format::ImageDataDesc& out_image_data) const { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_crop_image)); - // TODO: assume length of location/size to 2. - constexpr int32_t ndims = 2; + const int32_t ndim = request.size_ndim; if (request.level >= image_metadata_->resolution_info.level_count) { @@ -964,7 +1047,7 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request // (we cannot use `ifd->samples_per_pixel()` here) uint32_t samples_per_pixel = static_cast(image_metadata_->shape[dim_indices_.index('C')]); - for (int32_t i = 0; i < ndims; ++i) + for (int32_t i = 0; i < ndim; ++i) { if (request.location[i] < 0) { @@ -1110,4 +1193,217 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request return true; } +///////////////////////////// +// Iterator implementation // +///////////////////////////// + +CuImage::iterator CuImage::begin() +{ + return iterator(shared_from_this()); +} +CuImage::iterator CuImage::end() +{ + return iterator(shared_from_this(), true); +} + +CuImage::const_iterator CuImage::begin() const +{ + return const_iterator(shared_from_this()); +} + +CuImage::const_iterator CuImage::end() const +{ + return const_iterator(shared_from_this(), true); +} + +template +CuImageIterator::CuImageIterator(std::shared_ptr cuimg, bool ending) + : cuimg_(cuimg), loader_(nullptr), batch_index_(0), total_batch_count_(0) +{ + if (!cuimg_) + { + throw std::runtime_error("CuImageIterator: cuimg is nullptr!"); + } + + auto& image_data = cuimg_->image_data_; + cucim::loader::ThreadBatchDataLoader* loader = nullptr; + if (image_data) + { + loader = reinterpret_cast(image_data->loader); + loader_ = loader; + } + + if (ending) // point to the end + { + if (image_data) + { + if (loader) + { + total_batch_count_ = loader->total_batch_count(); + batch_index_ = total_batch_count_; + } + else + { + total_batch_count_ = 1; + batch_index_ = 1; + } + } + else + { + batch_index_ = 0; + } + } + else + { + if (image_data) + { + if (loader) + { + total_batch_count_ = loader->total_batch_count(); + if (loader->size() > 1) + { + batch_index_ = loader->processed_batch_count(); + } + else + { + batch_index_ = 0; + } + } + else + { + total_batch_count_ = 1; + batch_index_ = 0; + } + } + else + { + throw std::out_of_range("Batch index out of range! 
('image_data_' is null)"); + } + } +} + +template +typename CuImageIterator::reference CuImageIterator::operator*() const +{ + return cuimg_; +} + +template +typename CuImageIterator::pointer CuImageIterator::operator->() +{ + return cuimg_.get(); +} + +template +CuImageIterator& CuImageIterator::operator++() +{ + // Prefix increment + increase_index_(); + return *this; +} + +template +CuImageIterator CuImageIterator::operator++(int) +{ + // Postfix increment + auto temp(*this); + increase_index_(); + return temp; +} + +template +bool CuImageIterator::operator==(const CuImageIterator& other) +{ + return cuimg_.get() == other.cuimg_.get() && batch_index_ == other.batch_index_; +}; + +template +bool CuImageIterator::operator!=(const CuImageIterator& other) +{ + return cuimg_.get() != other.cuimg_.get() || batch_index_ != other.batch_index_; +}; + +template +int64_t CuImageIterator::index() +{ + auto loader = reinterpret_cast(loader_); + if (loader && (loader->size() > 1)) + { + batch_index_ = loader->processed_batch_count(); + } + return batch_index_; +} + +template +uint64_t CuImageIterator::size() const +{ + return total_batch_count_; +} + +template +void CuImageIterator::increase_index_() +{ + auto loader = reinterpret_cast(loader_); + if (loader) + { + auto next_data = loader->next_data(); + if (next_data) + { + auto& image_data = cuimg_->image_data_; + auto image_data_ptr = reinterpret_cast(&(image_data->container.data)); + + DLContext& ctx = image_data->container.ctx; + auto device_type = static_cast(ctx.device_type); + switch (device_type) + { + case io::DeviceType::kCPU: + if (*image_data_ptr) + { + cucim_free(*image_data_ptr); + } + break; + case io::DeviceType::kCUDA: + if (*image_data_ptr) + { + cudaError_t cuda_status; + CUDA_ERROR(cudaFree(*image_data_ptr)); + } + break; + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + + *image_data_ptr = next_data; + + if (loader->batch_size() > 1) + { + // Set value for dimension 'N' + cuimg_->image_data_->container.shape[0] = loader->data_batch_size(); + cuimg_->image_metadata_->shape[0] = loader->data_batch_size(); + } + } + if (loader->size() > 1) + { + batch_index_ = loader->processed_batch_count(); + } + else + { + if (batch_index_ < static_cast(total_batch_count_)) + { + ++batch_index_; + } + } + } + else + { + if (batch_index_ < static_cast(total_batch_count_)) + { + ++batch_index_; + } + } +} + } // namespace cucim \ No newline at end of file diff --git a/cpp/src/filesystem/cufile_driver.cpp b/cpp/src/filesystem/cufile_driver.cpp index d2a7bd9eb..67ca9cdda 100644 --- a/cpp/src/filesystem/cufile_driver.cpp +++ b/cpp/src/filesystem/cufile_driver.cpp @@ -195,14 +195,13 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ file_flags_ = flags; FileHandleType file_type = (flags & O_DIRECT) ? FileHandleType::kPosixODirect : FileHandleType::kPosix; - handle_ = CuCIMFileHandle{ fd, - nullptr, - file_type, - const_cast(file_path_.c_str()), - this, - static_cast(st.st_dev), - static_cast(st.st_ino), - static_cast(st.st_mtim.tv_nsec) }; + // Copy file path (Allocated memory would be freed at close() method.) 
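increase_index_() above rotates batch buffers: each increment frees the batch the caller has just consumed and adopts the pointer returned by next_data(), which for CUDA output means a cudaFree/cudaMalloc pair per step. A device-neutral sketch of that rotation (BufferRing is illustrative, not the loader class):

```cpp
#include <cstddef>
#include <cstdlib>
#include <vector>

// Illustrative buffer rotation: the consumer always owns exactly one batch
// buffer at a time; advancing hands out the filled slot and replaces it.
class BufferRing {
public:
    explicit BufferRing(std::size_t batch_bytes, std::size_t slot_count)
        : slots_(slot_count), batch_bytes_(batch_bytes) {
        for (auto& s : slots_) s = std::malloc(batch_bytes_); // like raster_data_
    }
    ~BufferRing() { for (auto& s : slots_) std::free(s); }

    // Like next_data(): return the filled head slot and immediately replace
    // it so workers can keep prefetching into the ring behind the consumer.
    void* next() {
        void* ready = slots_[head_];
        slots_[head_] = std::malloc(batch_bytes_);
        head_ = (head_ + 1) % slots_.size();
        return ready; // the caller frees this buffer when done with it
    }

private:
    std::vector<void*> slots_;
    std::size_t batch_bytes_;
    std::size_t head_ = 0;
};
```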
+ char* file_path_cstr = static_cast(cucim_malloc(file_path_.size() + 1)); + memcpy(file_path_cstr, file_path_.c_str(), file_path_.size()); + file_path_cstr[file_path_.size()] = '\0'; + handle_ = std::make_shared(fd, nullptr, file_type, const_cast(file_path_cstr), this, + static_cast(st.st_dev), static_cast(st.st_ino), + static_cast(st.st_mtim.tv_nsec)); CUfileError_t status; CUfileDescr_t cf_descr{}; // It is important to set zero! @@ -212,10 +211,10 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ { cf_descr.handle.fd = fd; cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - status = cuFileHandleRegister(&handle_.cufile, &cf_descr); + status = cuFileHandleRegister(&handle_->cufile, &cf_descr); if (status.err == CU_FILE_SUCCESS) { - handle_.type = FileHandleType::kGPUDirect; + handle_->type = FileHandleType::kGPUDirect; } else { @@ -236,7 +235,7 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ mmap_ptr_ = mmap((void*)0, file_size_, PROT_READ, MAP_SHARED, fd, 0); if (mmap_ptr_ != MAP_FAILED) { - handle_.type = FileHandleType::kMemoryMapped; + handle_->type = FileHandleType::kMemoryMapped; } else { @@ -258,7 +257,7 @@ ssize_t pread(const std::shared_ptr& fd, void* buf, size_t count, } else { - fmt::print(stderr, "fd (CuFileDriver) is null!"); + fmt::print(stderr, "fd (CuFileDriver) is null!\n"); return -1; } } @@ -270,7 +269,7 @@ ssize_t pwrite(const std::shared_ptr& fd, const void* buf, size_t } else { - fmt::print(stderr, "fd (CuFileDriver) is null!"); + fmt::print(stderr, "fd (CuFileDriver) is null!\n"); return -1; } } @@ -448,7 +447,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu cudaPointerAttributes attributes; cudaMemoryType memory_type; - FileHandleType file_type = handle_.type; + FileHandleType file_type = handle_->type; CUDA_TRY(cudaPointerGetAttributes(&attributes, buf)); if (cuda_status) @@ -482,7 +481,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu { break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); CUDA_TRY(cudaMemcpy(output_buf, cache_buf, bytes_to_copy, cudaMemcpyHostToDevice)); if (cuda_status) { @@ -497,7 +496,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu } else { - total_read_cnt = ::pread(handle_.fd, reinterpret_cast(buf) + buf_offset, count, file_offset); + total_read_cnt = ::pread(handle_->fd, reinterpret_cast(buf) + buf_offset, count, file_offset); } } else if (file_type == FileHandleType::kMemoryMapped) @@ -517,7 +516,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu } total_read_cnt = count; } - else if (memory_type == cudaMemoryTypeUnregistered || handle_.type == FileHandleType::kPosixODirect) + else if (memory_type == cudaMemoryTypeUnregistered || handle_->type == FileHandleType::kPosixODirect) { uint64_t buf_align = (reinterpret_cast(buf) + buf_offset) % PAGE_SIZE; bool is_aligned = (buf_align == 0) && ((file_offset % PAGE_SIZE) == 0); @@ -531,7 +530,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (memory_type == cudaMemoryTypeUnregistered) { read_cnt = - ::pread(handle_.fd, reinterpret_cast(buf) + buf_offset, block_read_size, file_offset); + ::pread(handle_->fd, reinterpret_cast(buf) + buf_offset, block_read_size, file_offset); total_read_cnt += read_cnt; } else @@ -551,7 +550,7 @@ ssize_t 
CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); CUDA_TRY(cudaMemcpy(input_buf, cache_buf, bytes_to_copy, cudaMemcpyHostToDevice)); if (cuda_status) { @@ -574,7 +573,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu // Read the remaining block (size of PAGE_SIZE) ssize_t read_cnt; - read_cnt = ::pread(handle_.fd, buf_pos, PAGE_SIZE, block_read_size); + read_cnt = ::pread(handle_->fd, buf_pos, PAGE_SIZE, block_read_size); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the remaining file content block! ({})\n", std::strerror(errno)); @@ -612,7 +611,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (large_block_size <= cache_size) // Optimize if bytes to load is less than cache_size { - ssize_t read_cnt = ::pread(handle_.fd, cache_buf, large_block_size, file_start_offset); + ssize_t read_cnt = ::pread(handle_->fd, cache_buf, large_block_size, file_start_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the file content block! ({})\n", std::strerror(errno)); @@ -652,7 +651,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu // Handle the head part of the file content if (header_size) { - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, read_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, read_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the head part of the file content block! ({})\n", @@ -691,7 +690,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); if (memory_type == cudaMemoryTypeUnregistered) { memcpy(output_buf, cache_buf, bytes_to_copy); @@ -715,7 +714,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (tail_size) { // memset(internal_buf_pos, 0, PAGE_SIZE); // no need to initialize for pread() - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, read_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, read_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the tail part of the file content block! 
({})\n", @@ -745,7 +744,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu { (void*)s_cufile_cache.device_cache(); // Lazy initialization - ssize_t read_cnt = cuFileRead(handle_.cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); + ssize_t read_cnt = cuFileRead(handle_->cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); total_read_cnt += read_cnt; if (read_cnt < 0) { @@ -775,7 +774,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o cudaPointerAttributes attributes; cudaMemoryType memory_type; - FileHandleType file_type = handle_.type; + FileHandleType file_type = handle_->type; CUDA_TRY(cudaPointerGetAttributes(&attributes, buf)); if (cuda_status) @@ -808,7 +807,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o { return -1; } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; remaining_size -= write_cnt; @@ -818,7 +817,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } else { - total_write_cnt = ::pwrite(handle_.fd, reinterpret_cast(buf) + buf_offset, count, file_offset); + total_write_cnt = ::pwrite(handle_->fd, reinterpret_cast(buf) + buf_offset, count, file_offset); } } else if (file_type == FileHandleType::kMemoryMapped) @@ -826,7 +825,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o fmt::print(stderr, "[Error] pwrite() is not supported for Memory-mapped IO file type!\n"); return -1; } - else if (memory_type == cudaMemoryTypeUnregistered || handle_.type == FileHandleType::kPosixODirect) + else if (memory_type == cudaMemoryTypeUnregistered || handle_->type == FileHandleType::kPosixODirect) { uint64_t buf_align = (reinterpret_cast(buf) + buf_offset) % PAGE_SIZE; bool is_aligned = (buf_align == 0) && ((file_offset % PAGE_SIZE) == 0); @@ -841,7 +840,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (memory_type == cudaMemoryTypeUnregistered) { write_cnt = ::pwrite( - handle_.fd, reinterpret_cast(buf) + buf_offset, block_write_size, file_offset); + handle_->fd, reinterpret_cast(buf) + buf_offset, block_write_size, file_offset); total_write_cnt += write_cnt; } else @@ -866,7 +865,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o { return -1; } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; remaining_size -= write_cnt; @@ -885,7 +884,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o // Read the remaining block (size of PAGE_SIZE) ssize_t read_cnt; - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, block_write_size); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, block_write_size); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the remaining file content block! 
({})\n", std::strerror(errno)); @@ -908,7 +907,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, block_write_size); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, block_write_size); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the remaining file content! ({})\n", std::strerror(errno)); @@ -933,7 +932,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (large_block_size <= cache_size) // Optimize if bytes to write is less than cache_size { memset(cache_buf, 0, PAGE_SIZE); - ssize_t read_cnt = ::pread(handle_.fd, cache_buf, PAGE_SIZE, file_start_offset); + ssize_t read_cnt = ::pread(handle_->fd, cache_buf, PAGE_SIZE, file_start_offset); if (read_cnt < 0) { fmt::print( @@ -942,7 +941,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } if (large_block_size > PAGE_SIZE) { - read_cnt = ::pread(handle_.fd, cache_buf + large_block_size - PAGE_SIZE, PAGE_SIZE, + read_cnt = ::pread(handle_->fd, cache_buf + large_block_size - PAGE_SIZE, PAGE_SIZE, end_boundary_offset - PAGE_SIZE); if (read_cnt < 0) { @@ -966,7 +965,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - ssize_t write_cnt = ::pwrite(handle_.fd, cache_buf, large_block_size, file_start_offset); + ssize_t write_cnt = ::pwrite(handle_->fd, cache_buf, large_block_size, file_start_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the file content block! ({})\n", std::strerror(errno)); @@ -993,7 +992,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o // Handle the head part of the file content if (header_size) { - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the head part of the file content block! ({})\n", @@ -1017,7 +1016,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the head part of the file content block! ({})\n", @@ -1052,7 +1051,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o return -1; } } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; body_remaining_size -= write_cnt; @@ -1064,7 +1063,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (tail_size) { memset(internal_buf_pos, 0, PAGE_SIZE); - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the tail part of the file content block! 
({})\n", @@ -1087,7 +1086,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the tail part of the file content block! ({})\n", @@ -1104,7 +1103,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o (void*)s_cufile_cache.device_cache(); // Lazy initialization ssize_t write_cnt = - cuFileWrite(handle_.cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); + cuFileWrite(handle_->cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); if (write_cnt < 0) { fmt::print(stderr, "[cuFile Error] {}\n", CUFILE_ERRSTR(write_cnt)); @@ -1122,10 +1121,10 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } bool CuFileDriver::close() { - if (handle_.cufile) + if (handle_->cufile) { - cuFileHandleDeregister(handle_.cufile); - handle_.cufile = nullptr; + cuFileHandleDeregister(handle_->cufile); + } if (mmap_ptr_) { @@ -1136,22 +1135,21 @@ bool CuFileDriver::close() } mmap_ptr_ = nullptr; } - if (handle_.fd != -1) + if (handle_->fd != -1) { // If block write was used if ((file_flags_ & O_RDWR) && - (handle_.type == FileHandleType::kGPUDirect || handle_.type == FileHandleType::kPosixODirect)) + (handle_->type == FileHandleType::kGPUDirect || handle_->type == FileHandleType::kPosixODirect)) { // Truncate file assuming that `file_size_` is up to date during pwrite() calls - int err = ::ftruncate(handle_.fd, file_size_); + int err = ::ftruncate(handle_->fd, file_size_); if (err < 0) { - fmt::print(stderr, "[Error] Cannot resize the file {} to {} ({})\n", handle_.path, file_size_, + fmt::print(stderr, "[Error] Cannot resize the file {} to {} ({})\n", handle_->path, file_size_, std::strerror(errno)); } } - ::close(handle_.fd); - handle_.fd = -1; + handle_ = nullptr; } file_path_.clear(); file_size_ = 0; diff --git a/cpp/src/loader/batch_data_processor.cpp b/cpp/src/loader/batch_data_processor.cpp new file mode 100644 index 000000000..2ba188b18 --- /dev/null +++ b/cpp/src/loader/batch_data_processor.cpp @@ -0,0 +1,77 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
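The O_DIRECT branches above never issue a transfer that is not a whole number of PAGE_SIZE blocks: a misaligned head or tail is first read back into an aligned scratch buffer, patched in memory, and written out again. A simplified sketch of that read-modify-write for one head block (4096-byte pages assumed; the caller supplies a page-aligned scratch buffer; error handling elided):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <unistd.h>

constexpr std::size_t kPageSize = 4096;

// Read-modify-write one misaligned head block through an O_DIRECT 'fd':
// O_DIRECT requires page-aligned offsets and sizes, so patch a full page.
ssize_t write_unaligned_head(int fd, const char* src, std::size_t count,
                             off_t file_offset, void* aligned_buf)
{
    off_t block_offset = (file_offset / kPageSize) * kPageSize; // round down
    std::size_t in_page = static_cast<std::size_t>(file_offset - block_offset);
    std::size_t n = std::min(count, kPageSize - in_page);

    if (::pread(fd, aligned_buf, kPageSize, block_offset) < 0)      // fetch old page
        return -1;
    std::memcpy(static_cast<char*>(aligned_buf) + in_page, src, n); // patch bytes
    return ::pwrite(fd, aligned_buf, kPageSize, block_offset);      // rewrite page
}
```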
+ */ + +#include "cucim/loader/batch_data_processor.h" + +#include <deque> +#include <memory> + +#include "cucim/cache/image_cache_manager.h" + +namespace cucim::loader +{ + +BatchDataProcessor::BatchDataProcessor(const uint32_t batch_size) : batch_size_(batch_size), processed_index_count_(0) +{ +} + +BatchDataProcessor::~BatchDataProcessor() +{ +} + + +void BatchDataProcessor::add_tile(const TileInfo& tile) +{ + tiles_.emplace_back(tile); + ++total_index_count_; +} + +TileInfo BatchDataProcessor::remove_front_tile() +{ + const TileInfo tile = tiles_.front(); + tiles_.pop_front(); + ++processed_index_count_; + return tile; +} + +uint32_t BatchDataProcessor::request(std::deque<uint32_t>& batch_item_counts, const uint32_t num_remaining_patches) +{ + (void)batch_item_counts; + (void)num_remaining_patches; + return 0; +} + +uint32_t BatchDataProcessor::wait_batch(const uint32_t index_in_task, + std::deque<uint32_t>& batch_item_counts, + const uint32_t num_remaining_patches) +{ + (void)index_in_task; + (void)batch_item_counts; + (void)num_remaining_patches; + return 0; +} + +std::shared_ptr<cucim::cache::ImageCacheValue> BatchDataProcessor::wait_for_processing(const uint32_t) +{ + return std::shared_ptr<cucim::cache::ImageCacheValue>(); +} + +void BatchDataProcessor::shutdown() +{ +} + +} // namespace cucim::loader diff --git a/cpp/src/loader/thread_batch_data_loader.cpp b/cpp/src/loader/thread_batch_data_loader.cpp new file mode 100644 index 000000000..1d5cd5498 --- /dev/null +++ b/cpp/src/loader/thread_batch_data_loader.cpp @@ -0,0 +1,320 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cucim/loader/thread_batch_data_loader.h" + +#include + +#include + +#include "cucim/profiler/nvtx3.h" +#include "cucim/util/cuda.h" + +namespace cucim::loader +{ + +ThreadBatchDataLoader::ThreadBatchDataLoader(LoadFunc load_func, + std::unique_ptr batch_data_processor, + const cucim::io::Device out_device, + std::unique_ptr> location, + std::unique_ptr> image_size, + const uint64_t location_len, + const size_t one_raster_size, + const uint32_t batch_size, + const uint32_t prefetch_factor, + const uint32_t num_workers) + : load_func_(load_func), + out_device_(out_device), + location_(std::move(location)), + image_size_(std::move(image_size)), + location_len_(location_len), + one_rester_size_(one_raster_size), + batch_size_(batch_size), + prefetch_factor_(prefetch_factor), + num_workers_(num_workers), + batch_data_processor_(std::move(batch_data_processor)), + buffer_size_(one_raster_size * batch_size), + thread_pool_(num_workers), + queued_item_count_(0), + buffer_item_head_index_(0), + buffer_item_tail_index_(0), + processed_batch_count_(0), + current_data_(nullptr), + current_data_batch_size_(0) +{ + buffer_item_len_ = std::min(static_cast(location_len_), static_cast(1 + prefetch_factor_)), + + raster_data_.reserve(buffer_item_len_); + cucim::io::DeviceType device_type = out_device_.type(); + for (size_t i = 0; i < buffer_item_len_; ++i) + { + switch (device_type) + { + case io::DeviceType::kCPU: + raster_data_.emplace_back(static_cast(cucim_malloc(buffer_size_))); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + void* image_data_ptr = nullptr; + CUDA_ERROR(cudaMalloc(&image_data_ptr, buffer_size_)); + raster_data_.emplace_back(static_cast(image_data_ptr)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + } +} + +ThreadBatchDataLoader::~ThreadBatchDataLoader() +{ + cucim::io::DeviceType device_type = out_device_.type(); + for (auto& raster_ptr : raster_data_) + { + switch (device_type) + { + case io::DeviceType::kCPU: + if (raster_ptr) + { + cucim_free(raster_ptr); + } + break; + case io::DeviceType::kCUDA: + cudaError_t cuda_status; + if (raster_ptr) + { + cuda_status = cudaSuccess; + CUDA_TRY(cudaFree(raster_ptr)); + } + break; + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!", device_type); + break; + } + raster_ptr = nullptr; + } + if (batch_data_processor_) + { + stopped_ = true; + batch_data_processor_->shutdown(); + } +} + +ThreadBatchDataLoader::operator bool() const +{ + return (num_workers_ > 0); +} + +uint8_t* ThreadBatchDataLoader::raster_pointer(const uint64_t location_index) const +{ + uint64_t buffer_item_index = (location_index / batch_size_) % buffer_item_len_; + uint32_t raster_data_index = location_index % batch_size_; + + assert(buffer_item_index < buffer_item_len_); + + uint8_t* batch_raster_ptr = raster_data_[buffer_item_index]; + + return &batch_raster_ptr[raster_data_index * one_rester_size_]; +} + +uint32_t ThreadBatchDataLoader::request(uint32_t load_size) +{ + if (num_workers_ == 0) + { + return 0; + } + + if (load_size == 0) + { + load_size = batch_size_; + } + + uint32_t num_items_to_request = std::min(load_size, static_cast(location_len_ - queued_item_count_)); + for (uint32_t i = 0; i < num_items_to_request; ++i) + { + uint32_t last_item_count = 0; + if 
(!tasks_.empty()) + { + last_item_count = tasks_.size(); + } + load_func_(this, queued_item_count_); + ++queued_item_count_; + buffer_item_tail_index_ = queued_item_count_ % buffer_item_len_; + // Append the number of added tasks to the batch count list. + batch_item_counts_.emplace_back(tasks_.size() - last_item_count); + } + + if (batch_data_processor_) + { + uint32_t num_remaining_patches = static_cast(location_len_ - queued_item_count_); + batch_data_processor_->request(batch_item_counts_, num_remaining_patches); + } + return num_items_to_request; +} + +uint32_t ThreadBatchDataLoader::wait_batch() +{ + if (num_workers_ == 0) + { + return 0; + } + + uint32_t num_items_waited = 0; + for (uint32_t batch_item_index = 0; batch_item_index < batch_size_ && !batch_item_counts_.empty(); ++batch_item_index) + { + uint32_t batch_item_count = batch_item_counts_.front(); + for (uint32_t i = 0; i < batch_item_count; ++i) + { + auto& future = tasks_.front(); + future.wait(); + tasks_.pop_front(); + if (batch_data_processor_) + { + batch_data_processor_->remove_front_tile(); + uint32_t num_remaining_patches = static_cast(location_len_ - queued_item_count_); + batch_data_processor_->wait_batch(i, batch_item_counts_, num_remaining_patches); + } + } + batch_item_counts_.pop_front(); + num_items_waited += batch_item_count; + } + return num_items_waited; +} + + +uint8_t* ThreadBatchDataLoader::next_data() +{ + if (num_workers_ == 0) // (location_len == 1 && batch_size == 1) + { + // If it reads entire image with multi threads (using loader), release raster memory from batch data loader. + uint8_t* batch_raster_ptr = raster_data_[0]; + raster_data_[0] = nullptr; + return batch_raster_ptr; + } + + if (processed_batch_count_ * batch_size_ >= location_len_) + { + // Remove buffer items that are no longer needed. + for (size_t i = 0; i < buffer_item_len_; ++i) + { + raster_data_[i] = nullptr; + } + return nullptr; + } + + // Wait until the batch is ready. 
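request() above queues decode tasks and records how many tasks belong to each batch; wait_batch() later blocks on exactly that many futures before a batch is handed out. A minimal sketch of the same bookkeeping with std::async standing in for the thread pool (PrefetchQueue is illustrative, not the loader class):

```cpp
#include <cstdint>
#include <deque>
#include <functional>
#include <future>

// Minimal sketch of the request()/wait_batch() bookkeeping: every queued
// item contributes one future, and a batch is "ready" once the futures
// recorded for it have all completed.
class PrefetchQueue {
public:
    // Like request(): launch one decode task and remember its batch count.
    void request(std::function<void()> decode_one) {
        tasks_.emplace_back(std::async(std::launch::async, std::move(decode_one)));
        batch_item_counts_.emplace_back(1); // one task per item in this sketch
    }

    // Like wait_batch(): block until the oldest batch's tasks are done.
    void wait_front_batch() {
        if (batch_item_counts_.empty()) return;
        for (uint32_t i = 0; i < batch_item_counts_.front(); ++i) {
            tasks_.front().wait();
            tasks_.pop_front();
        }
        batch_item_counts_.pop_front();
    }

private:
    std::deque<std::future<void>> tasks_;
    std::deque<uint32_t> batch_item_counts_;
};
```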
+ wait_batch(); + + uint8_t* batch_raster_ptr = raster_data_[buffer_item_head_index_]; + + cucim::io::DeviceType device_type = out_device_.type(); + switch (device_type) + { + case io::DeviceType::kCPU: + raster_data_[buffer_item_head_index_] = static_cast(cucim_malloc(buffer_size_)); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + CUDA_ERROR(cudaMalloc(&raster_data_[buffer_item_head_index_], buffer_size_)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + + buffer_item_head_index_ = (buffer_item_head_index_ + 1) % buffer_item_len_; + + current_data_ = batch_raster_ptr; + current_data_batch_size_ = + std::min(location_len_ - (processed_batch_count_ * batch_size_), static_cast(batch_size_)); + + ++processed_batch_count_; + + // Prepare the next batch + request(batch_size_); + return batch_raster_ptr; +} + +BatchDataProcessor* ThreadBatchDataLoader::batch_data_processor() +{ + return batch_data_processor_.get(); +} + +std::shared_ptr ThreadBatchDataLoader::wait_for_processing(uint32_t index) +{ + if (batch_data_processor_ == nullptr || stopped_) + { + return std::shared_ptr(); + } + + return batch_data_processor_->wait_for_processing(index); +} + +uint64_t ThreadBatchDataLoader::size() const +{ + return location_len_; +} + +uint32_t ThreadBatchDataLoader::batch_size() const +{ + return batch_size_; +} + +uint64_t ThreadBatchDataLoader::total_batch_count() const +{ + return (location_len_ + batch_size_ - 1) / batch_size_; +} + +uint64_t ThreadBatchDataLoader::processed_batch_count() const +{ + return processed_batch_count_; +} + +uint8_t* ThreadBatchDataLoader::data() const +{ + return current_data_; +} + +uint32_t ThreadBatchDataLoader::data_batch_size() const +{ + return current_data_batch_size_; +} + +bool ThreadBatchDataLoader::enqueue(std::function task, const TileInfo& tile) +{ + if (num_workers_ > 0) + { + auto future = thread_pool_.enqueue(task); + tasks_.emplace_back(std::move(future)); + if (batch_data_processor_) + { + batch_data_processor_->add_tile(tile); + } + return true; + } + return false; +} + +} // namespace cucim::loader diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eb5910e33..2e892875b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -43,12 +43,21 @@ target_compile_definitions(cucim_tests CUCIM_VERSION_PATCH=${PROJECT_VERSION_PATCH} CUCIM_VERSION_BUILD=${PROJECT_VERSION_BUILD} ) + +# Include pthread +# (https://cmake.org/cmake/help/v3.18/module/FindThreads.html) +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) + target_link_libraries(cucim_tests PRIVATE CUDA::cudart ${CUCIM_PACKAGE_NAME} deps::catch2 deps::openslide + deps::taskflow + Threads::Threads # -lpthread ) include(ParseAndAddCatchTests) diff --git a/cucim.code-workspace b/cucim.code-workspace index f5a63064b..af60d4b02 100644 --- a/cucim.code-workspace +++ b/cucim.code-workspace @@ -6,6 +6,9 @@ { "path": "cpp/plugins/cucim.kit.cuslide" }, + { + "path": "cpp/plugins/cucim.kit.cumed" + }, { "path": "python" } @@ -141,7 +144,15 @@ "__node_handle": "cpp", "__memory": "cpp", "*.def": "cpp", - "__mutex_base": "cpp" + "__mutex_base": "cpp", + "mprealsupport": "cpp", + "compare": "cpp", + "concepts": "cpp", + "coroutine": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "tensor": "cpp" }, // 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eb5910e33..2e892875b 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -43,12 +43,21 @@ target_compile_definitions(cucim_tests
         CUCIM_VERSION_PATCH=${PROJECT_VERSION_PATCH}
         CUCIM_VERSION_BUILD=${PROJECT_VERSION_BUILD}
 )
+
+# Include pthread
+# (https://cmake.org/cmake/help/v3.18/module/FindThreads.html)
+set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+find_package(Threads REQUIRED)
+
 target_link_libraries(cucim_tests
         PRIVATE
             CUDA::cudart
             ${CUCIM_PACKAGE_NAME}
             deps::catch2
             deps::openslide
+            deps::taskflow
+            Threads::Threads # -lpthread
 )
 
 include(ParseAndAddCatchTests)
diff --git a/cucim.code-workspace b/cucim.code-workspace
index f5a63064b..af60d4b02 100644
--- a/cucim.code-workspace
+++ b/cucim.code-workspace
@@ -6,6 +6,9 @@
         {
             "path": "cpp/plugins/cucim.kit.cuslide"
         },
+        {
+            "path": "cpp/plugins/cucim.kit.cumed"
+        },
         {
             "path": "python"
         }
@@ -141,7 +144,15 @@
             "__node_handle": "cpp",
             "__memory": "cpp",
             "*.def": "cpp",
-            "__mutex_base": "cpp"
+            "__mutex_base": "cpp",
+            "mprealsupport": "cpp",
+            "compare": "cpp",
+            "concepts": "cpp",
+            "coroutine": "cpp",
+            "numbers": "cpp",
+            "semaphore": "cpp",
+            "stop_token": "cpp",
+            "tensor": "cpp"
         },
         // https://vector-of-bool.github.io/docs/vscode-cmake-tools/settings.html
         "cmake.buildTask": true,
@@ -160,7 +171,6 @@
                 "options": {
                     "cwd": "${workspaceFolder}",
                     "env": {
-                        // Workaround the environment variable issue: https://github.com/microsoft/vscode/issues/121470
                         "PATH": "${env:HOME}/.local/bin:${env:PATH}"
                     }
                 },
@@ -179,14 +189,17 @@
                 "type": "shell",
                 "command": "./run build_local all release",
                 "options": {
-                    "cwd": "${workspaceFolder}"
+                    "cwd": "${workspaceFolder}",
+                    "env": {
+                        "PATH": "${env:HOME}/.local/bin:${env:PATH}"
+                    }
                 },
                 "presentation": {
                     "reveal": "always",
                     "focus": true
                 },
                 "problemMatcher": [],
-                "group": "build",
+                "group": "build"
             }
         ]
     },
@@ -210,7 +223,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -238,7 +251,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -259,9 +272,10 @@
             "name": "(gdb) cucim_py",
             "type": "cppdbg",
             "request": "launch",
-            "program": "/usr/bin/python3",
+            "program": "/usr/bin/bash",
             // https://github.com/catchorg/Catch2/blob/devel/docs/command-line.md#specifying-which-tests-to-run
             "args": [
+                "${workspaceFolder:cucim}/scripts/debug_python",
                 "${workspaceFolder:python}/cucim/src/localtest.py",
             ],
             "stopAtEntry": false,
@@ -269,7 +283,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -301,7 +315,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 }
             ],
             "console": "externalTerminal",
diff --git a/examples/cpp/tiff_image/main.cpp b/examples/cpp/tiff_image/main.cpp
index 051652285..41b212e1c 100644
--- a/examples/cpp/tiff_image/main.cpp
+++ b/examples/cpp/tiff_image/main.cpp
@@ -59,8 +59,7 @@ int main(int argc, char* argv[])
     fmt::print("channel_names: ({})\n", fmt::join(associated_image.channel_names(), ", "));
     fmt::print("\n");
 
-    cucim::CuImage region =
-        image.read_region({ 10000, 10000 }, { 1024, 1024 }, 0, cucim::DimIndices{}, "cpu", nullptr, "");
+    cucim::CuImage region = image.read_region({ 10000, 10000 }, { 1024, 1024 }, 0);
 
     fmt::print("is_loaded: {}\n", region.is_loaded());
     fmt::print("device: {}\n", std::string(region.device()));
@@ -83,9 +82,21 @@ int main(int argc, char* argv[])
 
     region.save(fmt::format("{}/output.ppm", output_folder_path));
 
-    cucim::CuImage region2 =
-        image.read_region({ 5000, 5000 }, { 1024, 1024 }, 1, cucim::DimIndices{}, "cpu", nullptr, "");
+    cucim::CuImage region2 = image.read_region({ 5000, 5000 }, { 1024, 1024 }, 1);
 
     region2.save(fmt::format("{}/output2.ppm", output_folder_path));
 
+    // Batch loading image
+    // Note: the source cucim::CuImage must be owned by a std::shared_ptr; otherwise,
+    //       iterating over the batch result throws std::bad_weak_ptr.
+    auto batch_image = std::make_shared<cucim::CuImage>(input_file_path);
+
+    auto region3 = std::make_shared<cucim::CuImage>(batch_image->read_region(
+        { 0, 0, 100, 200, 300, 300, 400, 400, 500, 500, 600, 600, 700, 700, 800, 800, 900, 900, 1000, 1000 },
+        { 200, 200 }, 0 /*level*/, 2 /*num_workers*/, 2 /*batch_size*/, false /*drop_last*/, 1 /*prefetch_factor*/,
+        false /*shuffle*/, 0 /*seed*/));
+
+    for (auto batch : *region3)
+    {
+        fmt::print("shape: {}, data size: {}\n", fmt::join(batch->shape(), ", "), batch->container().size());
+    }
 
     return 0;
 }
diff --git a/python/cucim/src/localtest.py b/python/cucim/src/localtest.py
index 59955dbc8..b255b8287 100644
--- a/python/cucim/src/localtest.py
+++ b/python/cucim/src/localtest.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
index fe2a3b976..676168937 100644
--- a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
+++ b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
@@ -18,14 +18,14 @@
 from ...util.io import open_image_cucim
 
 
-def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256):
+def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256_jpeg):
     import GPUtil
 
     gpus = GPUtil.getGPUs()
 
     if len(gpus) == 0:
         pytest.skip('No gpu available')
 
-    img = open_image_cucim(testimg_tiff_stripe_4096x4096_256)
+    img = open_image_cucim(testimg_tiff_stripe_4096x4096_256_jpeg)
 
     gpu = gpus[0]
     mem_usage_history = [gpu.memoryUsed]
@@ -38,11 +38,10 @@ def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256):
 
     print(mem_usage_history)
 
-    # Memory usage difference should be less than 40MB
+    # The difference in memory usage should be less than 180MB.
     # Note: Since we cannot measure GPU memory usage for a process,
-    # we use a rough number.
-    # Actual CUDA memory used would be 48MB per iteration (4096x4096x3).
-    assert mem_usage_history[4] - mem_usage_history[1] < 40.0
+    # we use a rough number (experimentally measured).
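+    # (The bound is loose: with device='cuda', CUDA and nvJPEG keep internal
+    # allocations alive across iterations, so usage sits well above the raw
+    # raster size without actually leaking.)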
+    assert mem_usage_history[4] - mem_usage_history[1] < 180.0
 
 
 def test_read_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256):
diff --git a/python/pybind11/cache/cache_py.cpp b/python/pybind11/cache/cache_py.cpp
index f46352cb1..f028aa922 100644
--- a/python/pybind11/cache/cache_py.cpp
+++ b/python/pybind11/cache/cache_py.cpp
@@ -123,12 +123,12 @@ void py_image_cache_reserve(ImageCache& cache, uint32_t memory_capacity, py::kwargs kwargs)
 }
 
 py::int_ py_preferred_memory_capacity(const py::object& img,
-                                      const std::optional<std::vector<uint32_t>>& image_size,
+                                      const std::optional<std::vector<uint64_t>>& image_size,
                                       const std::optional<std::vector<uint32_t>>& tile_size,
                                       const std::optional<std::vector<uint32_t>>& patch_size,
                                       uint32_t bytes_per_pixel)
 {
-    std::vector<uint32_t> param_image;
+    std::vector<uint64_t> param_image;
     std::vector<uint32_t> param_tile;
     std::vector<uint32_t> param_patch;
 
diff --git a/python/pybind11/cache/cache_py.h b/python/pybind11/cache/cache_py.h
index e0b817fcd..99af48bba 100644
--- a/python/pybind11/cache/cache_py.h
+++ b/python/pybind11/cache/cache_py.h
@@ -40,7 +40,7 @@ py::dict py_config(ImageCache& cache);
 void py_image_cache_reserve(ImageCache& cache, uint32_t memory_capacity, py::kwargs kwargs);
 
 py::int_ py_preferred_memory_capacity(const py::object& img,
-                                      const std::optional<std::vector<uint32_t>>& image_size,
+                                      const std::optional<std::vector<uint64_t>>& image_size,
                                       const std::optional<std::vector<uint32_t>>& tile_size,
                                       const std::optional<std::vector<uint32_t>>& patch_size,
                                       uint32_t bytes_per_pixel);
diff --git a/python/pybind11/cache/cache_pydoc.h b/python/pybind11/cache/cache_pydoc.h
index 67840a40a..cca61557e 100644
--- a/python/pybind11/cache/cache_pydoc.h
+++ b/python/pybind11/cache/cache_pydoc.h
@@ -22,7 +22,7 @@ namespace cucim::cache::doc
 
 // py::int_ py_preferred_memory_capacity(const py::object& img,
-//                                       const std::optional<std::vector<uint32_t>>& image_size,
+//                                       const std::optional<std::vector<uint64_t>>& image_size,
 //                                       const std::optional<std::vector<uint32_t>>& tile_size,
 //                                       const std::optional<std::vector<uint32_t>>& patch_size,
 //                                       uint32_t bytes_per_pixel);
diff --git a/python/pybind11/cucim_py.cpp b/python/pybind11/cucim_py.cpp
index be91f9d3f..e3f48a03d 100644
--- a/python/pybind11/cucim_py.cpp
+++ b/python/pybind11/cucim_py.cpp
@@ -148,9 +148,15 @@ PYBIND11_MODULE(_cucim, m)
         .def_property("resolutions", &py_resolutions, nullptr, doc::CuImage::doc_resolutions,
                       py::call_guard<py::gil_scoped_release>()) //
         .def("read_region", &py_read_region, doc::CuImage::doc_read_region, py::call_guard<py::gil_scoped_release>(), //
-             py::arg("location") = py::list{}, //
-             py::arg("size") = py::list{}, //
+             py::arg("location") = py::tuple{}, //
+             py::arg("size") = py::tuple{}, //
              py::arg("level") = 0, //
+             py::arg("num_workers") = 0, //
+             py::arg("batch_size") = 1, //
+             py::arg("drop_last") = py::bool_(false), //
+             py::arg("prefetch_factor") = 2, //
+             py::arg("shuffle") = py::bool_(false), //
+             py::arg("seed") = py::int_(0), //
              py::arg("device") = io::Device(), //
              py::arg("buf") = py::none(), //
              py::arg("shm_name") = "") //
@@ -163,6 +169,12 @@ PYBIND11_MODULE(_cucim, m)
         .def("save", &CuImage::save, doc::CuImage::doc_save, py::call_guard<py::gil_scoped_release>()) //
         .def("close", &CuImage::close, doc::CuImage::doc_close, py::call_guard<py::gil_scoped_release>()) //
         .def("__bool__", &CuImage::operator bool, py::call_guard<py::gil_scoped_release>()) //
+        .def(
+            "__iter__", //
+            [](const std::shared_ptr<CuImage>& cuimg) { //
+                return cuimg->begin(); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
         .def(
             "__enter__",
             [](const std::shared_ptr<CuImage>& cuimg) { //
@@ -183,6 +195,30 @@ PYBIND11_MODULE(_cucim, m)
             },
             py::call_guard<py::gil_scoped_release>());
 
+    py::class_<CuImageIterator<CuImage>, std::shared_ptr<CuImageIterator<CuImage>>>(m, "CuImageIterator") //
+        .def(py::init<std::shared_ptr<CuImage>, bool>(), doc::CuImageIterator::doc_CuImageIterator,
+             py::arg("cuimg"), //
+             py::arg("ending") = false, py::call_guard<py::gil_scoped_release>())
+        .def(
+            "__len__",
+            [](const CuImageIterator<CuImage>& it) { //
+                return it.size(); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
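+        // (`__iter__` below returns a copy so that each Python-level iter() call
+        // keeps its own position.)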
+        .def(
+            "__iter__", //
+            [](CuImageIterator<CuImage>& it) { //
+                return CuImageIterator<CuImage>(it); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
+        .def("__next__", &py_cuimage_iterator_next, py::call_guard<py::gil_scoped_release>())
+        .def(
+            "__repr__", //
+            [](CuImageIterator<CuImage>& it) { //
+                return fmt::format("<cucim.CuImageIterator index:{}>", it.index());
+            },
+            py::call_guard<py::gil_scoped_release>());
+
     // We can use `"cpu"` instead of `Device("cpu")`
     py::implicitly_convertible<std::string, io::Device>();
 }
@@ -379,15 +415,61 @@ py::dict py_resolutions(const CuImage& cuimg)
 
 py::object py_read_region(const CuImage& cuimg,
-                          std::vector<int64_t>&& location,
+                          const py::iterable& location,
                           std::vector<int64_t>&& size,
                           int16_t level,
+                          uint32_t num_workers,
+                          uint32_t batch_size,
+                          bool drop_last,
+                          uint32_t prefetch_factor,
+                          bool shuffle,
+                          uint64_t seed,
                           const io::Device& device,
                           const py::object& buf,
                           const std::string& shm_name,
                           const py::kwargs& kwargs)
 {
+    if (!size.empty() && size.size() != 2)
+    {
+        throw std::runtime_error("size (patch size) should have two elements!");
+    }
+
     cucim::DimIndices indices;
+    std::vector<int64_t> locations;
+    {
+        py::gil_scoped_acquire scope_guard;
+
+        auto arr = pybind11::array_t<int64_t>::ensure(location);
+        if (arr) // fast copy
+        {
+            py::buffer_info buf = arr.request();
+            int64_t* data_array = static_cast<int64_t*>(buf.ptr);
+            ssize_t data_size = buf.size;
+            locations.reserve(data_size);
+            locations.insert(locations.end(), &data_array[0], &data_array[data_size]);
+        }
+        else
+        {
+            auto iter = py::iter(location);
+            while (iter != py::iterator::sentinel())
+            {
+                if (py::isinstance<py::iterable>(*iter))
+                {
+                    auto iter2 = py::iter(*iter);
+                    while (iter2 != py::iterator::sentinel())
+                    {
+                        locations.emplace_back(py::cast<int64_t>(*iter2));
+                        ++iter2;
+                    }
+                }
+                else
+                {
+                    locations.emplace_back(py::cast<int64_t>(*iter));
+                }
+                ++iter;
+            }
+        }
+    }
 
     if (kwargs)
     {
@@ -414,9 +496,6 @@ py::object py_read_region(const CuImage& cuimg,
             }
 
             indices_args.emplace_back(std::make_pair(key_char, value));
-
-            // fmt::print("k:{} v:{}\n", std::string(py::str(item.first)),
-            //            std::string(py::str(item.second)));
         }
 
         indices = cucim::DimIndices(indices_args);
@@ -427,14 +506,26 @@ py::object py_read_region(const CuImage& cuimg,
     }
 
     auto region_ptr = std::make_shared<CuImage>(
-        cuimg.read_region(std::move(location), std::move(size), level, indices, device, nullptr, ""));
+        std::move(cuimg.read_region(std::move(locations), std::move(size), level, num_workers, batch_size, drop_last,
+                                    prefetch_factor, shuffle, seed, indices, device, nullptr, "")));
+
+    auto loader = region_ptr->loader();
+    if (batch_size > 1 || (loader && loader->size() > 1))
+    {
+        auto iter_ptr = region_ptr->begin();
+
+        py::gil_scoped_acquire scope_guard;
+
+        py::object iter = py::cast(iter_ptr);
+        return iter;
+    }
+    else
     {
         py::gil_scoped_acquire scope_guard;
 
         py::object region = py::cast(region_ptr);
 
-        // Add `__array_interace__` or `__cuda_array_interface__` in runtime.
+        // Add `__array_interface__` or `__cuda_array_interface__` at runtime.
         _set_array_interface(region);
 
         return region;
     }
 }
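
The `location` parsing above is permissive about input layout. A short sketch of the
accepted forms, with arbitrary coordinate values:

    import numpy as np

    # Equivalent ways to request two patches at (0, 0) and (100, 200):
    locs_flat   = [0, 0, 100, 200]                         # flat coordinate sequence
    locs_nested = [(0, 0), (100, 200)]                     # sequence of coordinate pairs
    locs_array  = np.asarray(locs_nested, dtype=np.int64)  # buffer-protocol fast path

An int64 NumPy array is copied in bulk through the `array_t` fast path; any other
iterable is walked element by element, descending one level into nested iterables.
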
@@ -450,13 +541,40 @@ py::object py_associated_image(const CuImage& cuimg, const std::string& name, const io::Device& device)
 
         py::object image = py::cast(image_ptr);
 
-        // Add `__array_interace__` or `__cuda_array_interface__` in runtime.
+        // Add `__array_interface__` or `__cuda_array_interface__` at runtime.
         _set_array_interface(image);
 
         return image;
     }
 }
 
+py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it)
+{
+    bool stop_iteration = (it.index() == it.size());
+
+    // Get the next batch of images.
+    ++it;
+
+    auto cuimg = *it;
+    memory::DLTContainer container = cuimg->container();
+    DLTensor* tensor = static_cast<DLTensor*>(container);
+    cucim::loader::ThreadBatchDataLoader* loader = cuimg->loader();
+
+    {
+        py::gil_scoped_acquire scope_guard;
+        py::object cuimg_obj = py::cast(cuimg);
+        if (loader)
+        {
+            _set_array_interface(cuimg_obj);
+        }
+        if (stop_iteration)
+        {
+            throw py::stop_iteration();
+        }
+        return cuimg_obj;
+    }
+}
+
 void _set_array_interface(const py::object& cuimg_obj)
 {
     const auto& cuimg = cuimg_obj.cast<const CuImage&>();
 
     // TODO: using __array_struct__, access to array interface could be faster
     //       (https://numpy.org/doc/stable/reference/arrays.interface.html#c-struct-access)
     // TODO: check the performance difference between python int vs python long later.
+
+    loader::ThreadBatchDataLoader* loader = cuimg.loader();
     memory::DLTContainer container = cuimg.container();
 
-    const DLTensor* tensor = static_cast<const DLTensor*>(container);
+    DLTensor* tensor = static_cast<DLTensor*>(container);
     if (!tensor)
     {
         return;
     }
+    if (loader)
+    {
+        // Get the last available (batch) image.
+        tensor->data = loader->data();
+    }
 
-    const char* type_str = container.numpy_dtype();
-    py::str typestr = py::str(type_str);
+    if (tensor->data)
+    {
+        const char* type_str = container.numpy_dtype();
+        py::str typestr = py::str(type_str);
 
-    py::tuple data = pybind11::make_tuple(py::int_(reinterpret_cast<uint64_t>(tensor->data)), py::bool_(false));
-    py::list descr;
-    descr.append(py::make_tuple(""_s, typestr));
+        py::tuple data = pybind11::make_tuple(py::int_(reinterpret_cast<uint64_t>(tensor->data)), py::bool_(false));
+        py::list descr;
+        descr.append(py::make_tuple(""_s, typestr));
 
-    py::tuple shape = vector2pytuple<pybind11::int_>(cuimg.shape());
+        py::tuple shape = vector2pytuple<pybind11::int_>(cuimg.shape());
 
-    // TODO: depending on container's memory type, expose either array_interface or cuda_array_interface
-    switch (tensor->ctx.device_type)
-    {
-    case kDLCPU: {
-        // Reference: https://numpy.org/doc/stable/reference/arrays.interface.html
-        cuimg_obj.attr("__array_interface__") =
-            py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr,
-                      "typestr"_a = typestr, "shape"_a = shape, "version"_a = py::int_(3) };
-    }
-    break;
-    case kDLGPU: {
-        // Reference: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
-        cuimg_obj.attr("__cuda_array_interface__") =
-            py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr, "typestr"_a = typestr,
-                      "shape"_a = shape, "version"_a = py::int_(3), "mask"_a = py::none(), "stream"_a = 1 };
-    }
-    break;
-    default:
-        break;
-    }
+        // Depending on the container's memory type, expose either __array_interface__ or __cuda_array_interface__.
+        switch (tensor->ctx.device_type)
+        {
+        case kDLCPU: {
+            // Reference: https://numpy.org/doc/stable/reference/arrays.interface.html
+            cuimg_obj.attr("__array_interface__") =
+                py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr,
+                          "typestr"_a = typestr, "shape"_a = shape, "version"_a = py::int_(3) };
+        }
+        break;
+        case kDLGPU: {
+            // Reference: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+            cuimg_obj.attr("__cuda_array_interface__") =
+                py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr, "typestr"_a = typestr,
+                          "shape"_a = shape, "version"_a = py::int_(3), "mask"_a = py::none(), "stream"_a = 1 };
+        }
+        break;
+        default:
+            break;
+        }
+    }
+    else
+    {
+        switch (tensor->ctx.device_type)
+        {
+        case kDLCPU: {
+            if (py::hasattr(cuimg_obj, "__array_interface__"))
+            {
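+                // No current buffer is available; drop the stale attribute so Python
+                // code cannot dereference a dangling pointer.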
+                py::delattr(cuimg_obj, "__array_interface__");
+            }
+        }
+        break;
+        case kDLGPU: {
+            if (py::hasattr(cuimg_obj, "__cuda_array_interface__"))
+            {
+                py::delattr(cuimg_obj, "__cuda_array_interface__");
+            }
+        }
+        break;
+        default:
+            break;
+        }
+    }
 }
diff --git a/python/pybind11/cucim_py.h b/python/pybind11/cucim_py.h
index 2b0614d65..4d25cda7a 100644
--- a/python/pybind11/cucim_py.h
+++ b/python/pybind11/cucim_py.h
@@ -29,6 +29,8 @@ namespace cucim
 
 // Forward declarations
 class CuImage;
+template <typename DataType>
+class CuImageIterator;
 
 namespace io
 {
 class Device;
@@ -67,15 +69,23 @@ bool py_is_trace_enabled(py::object /* self */);
 json py_metadata(const CuImage& cuimg);
 py::dict py_resolutions(const CuImage& cuimg);
 py::object py_read_region(const CuImage& cuimg,
-                          std::vector<int64_t>&& location,
+                          const py::iterable& location,
                           std::vector<int64_t>&& size,
                           int16_t level,
+                          uint32_t num_workers,
+                          uint32_t batch_size,
+                          bool drop_last,
+                          uint32_t prefetch_factor,
+                          bool shuffle,
+                          uint64_t seed,
                           const io::Device& device,
                           const py::object& buf,
                           const std::string& shm_name,
                           const py::kwargs& kwargs);
 py::object py_associated_image(const CuImage& cuimg, const std::string& name, const io::Device& device);
 
+py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it);
+
 void _set_array_interface(const py::object& cuimg_obj);
 
 } // namespace cucim
diff --git a/python/pybind11/cucim_pydoc.h b/python/pybind11/cucim_pydoc.h
index 255cd8998..673a0d900 100644
--- a/python/pybind11/cucim_pydoc.h
+++ b/python/pybind11/cucim_pydoc.h
@@ -258,6 +258,16 @@ Add `__array_interface__` or `__cuda_array_interface__` depending on the memory type.
 
 }; // namespace CuImage
 
+namespace CuImageIterator
+{
+
+// CuImageIterator(std::shared_ptr<DataType> cuimg, bool ending = false);
+PYDOC(CuImageIterator, R"doc(
+Constructor of CuImageIterator.
+)doc")
+
+} // namespace CuImageIterator
+
 } // namespace cucim::doc
 
 #endif // PYCUCIM_CUCIM_PYDOC_H
diff --git a/run b/run
index e2cd109a3..8033eaf18 100755
--- a/run
+++ b/run
@@ -268,6 +268,8 @@ build_local_libcucim_() {
 
     # Copy cufile SDK from host system to temp/cuda
     copy_gds_files_ $source_folder
+    # Copy the nvJPEG SDK from the host system to temp/cuda
+    copy_nvjpeg_files_ $source_folder
 
     ${CMAKE_CMD} -S ${source_folder} -B ${build_folder} -G "Unix Makefiles" \
         -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \
@@ -471,6 +473,74 @@ copy_gds_files_() {
     fi
 }
 
+get_arch_name_() {
+    architecture="unknown"
+    case $(uname -m) in
+        x86_64) architecture="x86_64" ;;
+        arm|aarch64) lscpu | awk '/Architecture:/{print $2}' | grep -i -q "aarch64" && architecture="sbsa";;
+    esac
+    echo "${architecture}"
+}
+
+copy_nvjpeg_files_() {
+    local root_folder=${1:-${TOP}}
+    local arch_name="$(get_arch_name_)"
+    local nvjpeg_search="${root_folder}/temp/cuda/include:${root_folder}/temp/cuda/lib64"
+    local cuda_version="11.6"
+    local nvjpeg_version="11-6_11.6.0.55-1"
+    local candidate
+    local nvjpeg_include
+    local nvjpeg_lib
+
+    for candidate in ${nvjpeg_search}; do
+        nvjpeg_include="$(echo $candidate | cut -d: -f1)"
+        nvjpeg_lib="$(echo $candidate | cut -d: -f2)"
+        if [ -f ${nvjpeg_include}/nvjpeg.h ] && [ -f ${nvjpeg_lib}/libnvjpeg_static.a ]; then
+            c_echo W "nvJPEG client library is available at '${nvjpeg_include}/nvjpeg.h' and '${nvjpeg_lib}/libnvjpeg_static.a'"
+            break
+        fi
+        nvjpeg_include=""
+        nvjpeg_lib=""
+    done
+
+    if [ ! -f ${nvjpeg_include}/nvjpeg.h ]; then
+        c_echo_err Y "nvJPEG library is not available! Downloading the redistributable package to get nvjpeg.h and libraries."
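+
+        # Remove any stale nvJPEG headers/libraries before unpacking the fresh package.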
+        run_command rm -rf ${root_folder}/temp/cuda/include/*nvjpeg*
+        run_command rm -rf ${root_folder}/temp/cuda/lib64/*nvjpeg*
+        run_command mkdir -p ${root_folder}/temp/cuda/include ${root_folder}/temp/cuda/lib64
+
+        local temp_tgz_dir=$(mktemp -d)
+        pushd ${temp_tgz_dir}
+        c_echo W "Arch name: " G "${arch_name}"
+        if [ "${arch_name}" = "sbsa" ]; then
+            run_command wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/${arch_name}/libnvjpeg-dev-${nvjpeg_version}_arm64.deb
+        else
+            run_command wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/${arch_name}/libnvjpeg-dev-${nvjpeg_version}_amd64.deb
+        fi
+
+        mkdir -p libnvjpeg-dev
+        mv libnvjpeg-dev-${nvjpeg_version}*.deb libnvjpeg-dev/
+        pushd libnvjpeg-dev > /dev/null
+        run_command ar x libnvjpeg-dev-${nvjpeg_version}*.deb
+        run_command xz --decompress data.tar.xz
+        run_command tar xvf data.tar
+        popd > /dev/null
+
+        run_command cp libnvjpeg-dev/usr/local/cuda-${cuda_version}/include/nvjpeg.h ${root_folder}/temp/cuda/include/
+        run_command cp libnvjpeg-dev/usr/local/cuda-${cuda_version}/lib64/libnvjpeg_static.a ${root_folder}/temp/cuda/lib64/
+        popd > /dev/null
+        run_command rm -r ${temp_tgz_dir}
+    else
+        run_command mkdir -p ${root_folder}/temp/cuda/include ${root_folder}/temp/cuda/lib64
+
+        if [ "${nvjpeg_include}" != "${root_folder}/temp/cuda/include" ]; then
+            run_command cp -Pf ${nvjpeg_include}/*nvjpeg* ${root_folder}/temp/cuda/include/ || true
+            run_command cp -Pf ${nvjpeg_lib}/*nvjpeg* ${root_folder}/temp/cuda/lib64/ || true
+        fi
+    fi
+}
+
 build_python_package_desc() { echo 'Build Python package
 Note: This command does not remove `dist` folder before building.
@@ -483,6 +553,8 @@ build_python_package() {
 
     # Copy cufile SDK from host system to temp/cuda
     copy_gds_files_
+    # Copy the nvJPEG SDK from the host system to temp/cuda
+    copy_nvjpeg_files_
 
     run_command ${TOP}/dockcross-manylinux2014-x64 ./run build_python_package_
     ret=$?
diff --git a/scripts/debug_python b/scripts/debug_python
new file mode 100755
index 000000000..f62e377fe
--- /dev/null
+++ b/scripts/debug_python
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+if [ -e ${SCRIPT_DIR}/debug_env.sh ]; then
+    # Users can place a custom debug_env.sh next to this script
+    # (scripts/debug_env.sh is ignored by git; see .gitignore).
+    . ${SCRIPT_DIR}/debug_env.sh
+elif [ -e ${SCRIPT_DIR}/../.python-version ]; then
+    # pyenv needs to be initialized inside the script:
+    # https://github.com/pyenv/pyenv-virtualenv/issues/204
+    eval "$(pyenv init -)"
+    eval "$(pyenv virtualenv-init -)"
+    # The above effectively runs `pyenv activate $(cat ${SCRIPT_DIR}/../.python-version)`.
+else
+    echo "Environment file not found. Exiting."
+    exit 1
+fi
+
+echo "Python: $(python3 -c "import sys;print(sys.executable)")"
+exec env python3 "$@"
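
Usage note: the "(gdb) cucim_py" launch configuration in cucim.code-workspace runs this
script through bash (for example, `scripts/debug_python python/cucim/src/localtest.py`),
so the debugger attaches to the Python interpreter selected by debug_env.sh or pyenv.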