From 292c963d7449e9fff86c4a2fa68f5788c15cb259 Mon Sep 17 00:00:00 2001
From: Gigon Bae
Date: Thu, 27 Jan 2022 23:55:36 -0800
Subject: [PATCH] Support multi-threads and batch, and support nvJPEG for JPEG-compressed images (#191)

This patch addresses the following issues:
- #139
- #123
- #149

Please see #149 for the detailed design.

- Use [Taskflow](https://github.com/taskflow/taskflow) to maintain a thread pool and distribute the workload across multiple threads when the `num_workers` parameter of the `read_region()` API is larger than zero.
- Implement a buffered loader to load images in batches.
- Use nvJPEG to decode JPEG-compressed images when the `device` parameter of the `read_region()` API is `cuda` (though its performance still needs improvement).
- Share the file handle among multiple CuImage objects.
- Clean up some code.

This implementation was already released as part of the cuCIM [v21.12.01](https://github.com/rapidsai/cucim/wiki/release_notes_v21.12.01) release. The following changes are added on top of the existing ones:

- Use `nvjpeg.a` from the publicly available package.
- Fix a GPU memory leak in the nvJPEG code path (when the `device='cuda'` parameter is used in the `read_region` method).
- Get `libculibos.a` from `/usr/local/lib[64]` if the library cannot be found in the Conda environment (nvJPEG depends on that static library).

Authors:
- Gigon Bae (https://github.com/gigony)

Approvers:
- AJ Schmidt (https://github.com/ajschmidt8)
- https://github.com/jakirkham

URL: https://github.com/rapidsai/cucim/pull/191
---
 .gitignore | 2 +
 .idea/cucim.iml | 10 +-
 3rdparty/LICENSE.taskflow | 21 +
 CMakeLists.txt | 1 +
 LICENSE-3rdparty.md | 8 +-
 benchmarks/main.cpp | 3 +-
 conda/recipes/libcucim/build.sh | 1 +
 cpp/CMakeLists.txt | 9 +
 cpp/cmake/deps/taskflow.cmake | 39 +
 cpp/include/cucim/cache/image_cache.h | 17 +-
 cpp/include/cucim/cache/image_cache_manager.h | 7 +-
 cpp/include/cucim/concurrent/threadpool.h | 52 ++
 cpp/include/cucim/cuimage.h | 73 +-
 cpp/include/cucim/filesystem/cufile_driver.h | 2 +-
 cpp/include/cucim/filesystem/file_handle.h | 73 +-
 cpp/include/cucim/io/format/image_format.h | 37 +-
 .../cucim/loader/batch_data_processor.h | 59 ++
 .../cucim/loader/thread_batch_data_loader.h | 114 +++
 cpp/include/cucim/loader/tile_info.h | 37 +
 cpp/include/cucim/profiler/nvtx3.h | 2 +
 cpp/include/cucim/util/cuda.h | 37 +-
 cpp/plugins/cucim.kit.cumed/CMakeLists.txt | 2 +
 .../cucim.kit.cumed/src/cumed/cumed.cpp | 25 +-
 .../.idea/cucim.kit.cuslide.iml | 2 +-
 cpp/plugins/cucim.kit.cuslide/CMakeLists.txt | 12 +
 .../cmake/deps/libculibos.cmake | 42 +
 .../cmake/deps/libopenjpeg.cmake | 1 -
 .../cucim.kit.cuslide/cmake/deps/nvjpeg.cmake | 41 +
 .../cucim.kit.cuslide/src/cuslide/cuslide.cpp | 21 +-
 .../src/cuslide/jpeg/libnvjpeg.cpp | 57 ++
 .../src/cuslide/jpeg/libnvjpeg.h | 36 +
 .../src/cuslide/loader/nvjpeg_processor.cpp | 439 ++++++++++
 .../src/cuslide/loader/nvjpeg_processor.h | 110 +++
 .../src/cuslide/tiff/ifd.cpp | 801 ++++++++++++------
 .../cucim.kit.cuslide/src/cuslide/tiff/ifd.h | 25 +-
 .../src/cuslide/tiff/tiff.cpp | 36 +-
 .../cucim.kit.cuslide/src/cuslide/tiff/tiff.h | 7 +-
 cpp/src/cache/image_cache.cpp | 11 +-
 cpp/src/cache/image_cache_empty.cpp | 10 +-
 cpp/src/cache/image_cache_empty.h | 5 +-
 cpp/src/cache/image_cache_manager.cpp | 27 +-
 cpp/src/cache/image_cache_per_process.cpp | 117 ++-
 cpp/src/cache/image_cache_per_process.h | 14 +-
 cpp/src/cache/image_cache_shared_memory.cpp | 88 +-
 cpp/src/cache/image_cache_shared_memory.h | 14 +-
 cpp/src/concurrent/threadpool.cpp | 70 ++
 cpp/src/cuimage.cpp | 424 +++++++--
cpp/src/filesystem/cufile_driver.cpp | 100 ++- cpp/src/loader/batch_data_processor.cpp | 77 ++ cpp/src/loader/thread_batch_data_loader.cpp | 320 +++++++ cpp/tests/CMakeLists.txt | 9 + cucim.code-workspace | 32 +- examples/cpp/tiff_image/main.cpp | 19 +- python/cucim/src/localtest.py | 2 +- .../clara/test_read_region_memory_usage.py | 11 +- python/pybind11/cache/cache_py.cpp | 4 +- python/pybind11/cache/cache_py.h | 2 +- python/pybind11/cache/cache_pydoc.h | 2 +- python/pybind11/cucim_py.cpp | 216 ++++- python/pybind11/cucim_py.h | 12 +- python/pybind11/cucim_pydoc.h | 10 + run | 72 ++ scripts/debug_python | 32 + 63 files changed, 3314 insertions(+), 647 deletions(-) create mode 100644 3rdparty/LICENSE.taskflow create mode 100644 cpp/cmake/deps/taskflow.cmake create mode 100644 cpp/include/cucim/concurrent/threadpool.h create mode 100644 cpp/include/cucim/loader/batch_data_processor.h create mode 100644 cpp/include/cucim/loader/thread_batch_data_loader.h create mode 100644 cpp/include/cucim/loader/tile_info.h create mode 100644 cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake create mode 100644 cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp create mode 100644 cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h create mode 100644 cpp/src/concurrent/threadpool.cpp create mode 100644 cpp/src/loader/batch_data_processor.cpp create mode 100644 cpp/src/loader/thread_batch_data_loader.cpp create mode 100755 scripts/debug_python diff --git a/.gitignore b/.gitignore index ef3f2babb..40bbb6354 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ conda-bld # Large Images *.svs +# Custom debug environment setup script for VS Code (used by scripts/debug_python) +/scripts/debug_env.sh diff --git a/.idea/cucim.iml b/.idea/cucim.iml index 24b221981..6d70257c7 100644 --- a/.idea/cucim.iml +++ b/.idea/cucim.iml @@ -1,8 +1,2 @@ - - - - - - - - \ No newline at end of file + + \ No newline at end of file diff --git a/3rdparty/LICENSE.taskflow b/3rdparty/LICENSE.taskflow new file mode 100644 index 000000000..33887bc43 --- /dev/null +++ b/3rdparty/LICENSE.taskflow @@ -0,0 +1,21 @@ +TASKFLOW MIT LICENSE + +Copyright (c) 2018-2021 Dr. Tsung-Wei Huang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 585314c27..3c5c6e91e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ superbuild_depend(json) superbuild_depend(libcuckoo) superbuild_depend(boost-header-only) superbuild_depend(nvtx3) +superbuild_depend(taskflow) ################################################################################ # Define some names diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md index 680306f85..05c3f8cc7 100644 --- a/LICENSE-3rdparty.md +++ b/LICENSE-3rdparty.md @@ -85,7 +85,7 @@ DLPack - https://github.com/dmlc/dlpack/blob/main/LICENSE - Copyright: DLPack Contributors -NVIDIA CUDA TOOLKIT (including libcufile) +NVIDIA CUDA TOOLKIT (including libcufile/libnvjpeg) - License: NVIDIA License - https://docs.nvidia.com/cuda/pdf/EULA.pdf - Copyright: NVIDIA Corporation @@ -269,3 +269,9 @@ NVTX (NVIDIA Tool Extension Library) - https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/LICENSE.txt - Copyright: NVIDIA Corporation - Usage: Support for profiling with NVIDIA Nsight Systems + +Taskflow +- License: MIT License + - https://github.com/taskflow/taskflow +- Copyright: Dr. Tsung-Wei Huang +- Usage: Threadpool implementation for batch processing. diff --git a/benchmarks/main.cpp b/benchmarks/main.cpp index d33516bd6..7c2577713 100644 --- a/benchmarks/main.cpp +++ b/benchmarks/main.cpp @@ -65,8 +65,7 @@ static void test_cucim(benchmark::State& state) cucim::CuImage image = cucim::CuImage(input_path.c_str()); cucim::CuImage region = - image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0, - cucim::DimIndices{}, "cpu", nullptr, ""); + image.read_region({ request_location[0], request_location[1] }, { state.range(0), state.range(0) }, 0); } } diff --git a/conda/recipes/libcucim/build.sh b/conda/recipes/libcucim/build.sh index 364bb9533..9d2217bbc 100644 --- a/conda/recipes/libcucim/build.sh +++ b/conda/recipes/libcucim/build.sh @@ -5,6 +5,7 @@ CUCIM_BUILD_TYPE=${CUCIM_BUILD_TYPE:-release} echo "CC : ${CC}" echo "CXX : ${CXX}" echo "CUDAHOSTCXX : ${CUDAHOSTCXX}" +echo "CUDA : ${CUDA}" # For now CUDAHOSTCXX is set to `/usr/bin/g++` by # https://github.com/rapidsai/docker/blob/161b200157206660d88fb02cf69fe58d363ac95e/generated-dockerfiles/rapidsai-core_ubuntu18.04-devel.Dockerfile diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f6670c982..67409a699 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,6 +44,7 @@ add_library(${CUCIM_PACKAGE_NAME} include/cucim/codec/base64.h include/cucim/codec/hash_function.h include/cucim/codec/methods.h + include/cucim/concurrent/threadpool.h include/cucim/config/config.h include/cucim/core/framework.h include/cucim/core/plugin.h @@ -58,6 +59,9 @@ add_library(${CUCIM_PACKAGE_NAME} include/cucim/io/device.h include/cucim/io/device_type.h include/cucim/io/format/image_format.h + include/cucim/loader/batch_data_processor.h + include/cucim/loader/thread_batch_data_loader.h + include/cucim/loader/tile_info.h include/cucim/logger/logger.h include/cucim/logger/timer.h include/cucim/macros/defines.h @@ -85,6 +89,7 @@ add_library(${CUCIM_PACKAGE_NAME} src/cache/image_cache_shared_memory.h src/cache/image_cache_shared_memory.cpp src/codec/base64.cpp + src/concurrent/threadpool.cpp src/config/config.cpp src/core/cucim_framework.h src/core/cucim_framework.cpp @@ -98,6 +103,8 @@ add_library(${CUCIM_PACKAGE_NAME} src/io/device.cpp src/io/device_type.cpp src/io/format/image_format.cpp + src/loader/batch_data_processor.cpp + 
src/loader/thread_batch_data_loader.cpp src/logger/logger.cpp src/logger/timer.cpp src/memory/memory_manager.cpp @@ -144,6 +151,7 @@ target_compile_definitions(${CUCIM_PACKAGE_NAME} target_link_libraries(${CUCIM_PACKAGE_NAME} PUBLIC ${CMAKE_DL_LIBS} + Threads::Threads # -lpthread $ $ PRIVATE @@ -154,6 +162,7 @@ target_link_libraries(${CUCIM_PACKAGE_NAME} deps::boost-header-only deps::json deps::nvtx3 + deps::taskflow ) if (CUCIM_STATIC_GDS) diff --git a/cpp/cmake/deps/taskflow.cmake b/cpp/cmake/deps/taskflow.cmake new file mode 100644 index 000000000..147b6f4c3 --- /dev/null +++ b/cpp/cmake/deps/taskflow.cmake @@ -0,0 +1,39 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if (NOT TARGET deps::taskflow) + FetchContent_Declare( + deps-taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG v3.2.0 + GIT_SHALLOW TRUE + ) + FetchContent_GetProperties(deps-taskflow) + if (NOT deps-taskflow_POPULATED) + message(STATUS "Fetching taskflow sources") + FetchContent_Populate(deps-taskflow) + message(STATUS "Fetching taskflow sources - done") + endif () + + set(TF_BUILD_TESTS OFF) + set(TF_BUILD_EXAMPLES OFF) + + add_subdirectory(${deps-taskflow_SOURCE_DIR} ${deps-taskflow_BINARY_DIR} EXCLUDE_FROM_ALL) + + add_library(deps::taskflow INTERFACE IMPORTED GLOBAL) + target_link_libraries(deps::taskflow INTERFACE Taskflow) + set(deps-taskflow_SOURCE_DIR ${deps-taskflow_SOURCE_DIR} CACHE INTERNAL "" FORCE) + mark_as_advanced(deps-taskflow_SOURCE_DIR) +endif () diff --git a/cpp/include/cucim/cache/image_cache.h b/cpp/include/cucim/cache/image_cache.h index f8824be7b..d389bc10a 100644 --- a/cpp/include/cucim/cache/image_cache.h +++ b/cpp/include/cucim/cache/image_cache.h @@ -43,7 +43,10 @@ struct EXPORT_VISIBLE ImageCacheKey struct EXPORT_VISIBLE ImageCacheValue { - ImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + ImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); virtual ~ImageCacheValue(){}; operator bool() const; @@ -51,6 +54,7 @@ struct EXPORT_VISIBLE ImageCacheValue void* data = nullptr; uint64_t size = 0; void* user_obj = nullptr; + cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU; }; /** @@ -63,11 +67,14 @@ struct EXPORT_VISIBLE ImageCacheValue class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this { public: - ImageCache(const ImageCacheConfig& config, CacheType type = CacheType::kNoCache); + ImageCache(const ImageCacheConfig& config, + CacheType type = CacheType::kNoCache, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); virtual ~ImageCache(){}; virtual CacheType type() const; virtual const char* type_str() const; + virtual cucim::io::DeviceType device_type() const; virtual ImageCacheConfig& config(); virtual ImageCacheConfig get_config() const; @@ -79,14 +86,17 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this A shared pointer containing 
%ImageCacheKey. */ virtual std::shared_ptr create_key(uint64_t file_hash, uint64_t index) = 0; - virtual std::shared_ptr create_value(void* data, uint64_t size) = 0; + virtual std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) = 0; virtual void* allocate(std::size_t n) = 0; virtual void lock(uint64_t index) = 0; virtual void unlock(uint64_t index) = 0; + virtual void* mutex(uint64_t index) = 0; virtual bool insert(std::shared_ptr& key, std::shared_ptr& value) = 0; + virtual void remove_front() = 0; virtual uint32_t size() const = 0; virtual uint64_t memory_size() const = 0; @@ -128,6 +138,7 @@ class EXPORT_VISIBLE ImageCache : public std::enable_shared_from_this& image_size, +uint32_t EXPORT_VISIBLE preferred_memory_capacity(const std::vector& image_size, const std::vector& tile_size, const std::vector& patch_size, uint32_t bytes_per_pixel = 3); @@ -43,9 +44,11 @@ class EXPORT_VISIBLE ImageCacheManager void reserve(uint32_t new_memory_capacity); void reserve(uint32_t new_memory_capacity, uint32_t new_capacity); + static std::unique_ptr create_cache(const ImageCacheConfig& cache_config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); + private: std::unique_ptr create_cache() const; - std::unique_ptr create_cache(const ImageCacheConfig& cache_config) const; std::shared_ptr cache_; }; diff --git a/cpp/include/cucim/concurrent/threadpool.h b/cpp/include/cucim/concurrent/threadpool.h new file mode 100644 index 000000000..33e0a18ca --- /dev/null +++ b/cpp/include/cucim/concurrent/threadpool.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUCIM_CONCURRENT_THREADPOOL_H +#define CUCIM_CONCURRENT_THREADPOOL_H + +#include "cucim/macros/api_header.h" + +#include +#include +#include + +namespace cucim::concurrent +{ + +class EXPORT_VISIBLE ThreadPool +{ +public: + explicit ThreadPool(int32_t num_workers); + ThreadPool(const ThreadPool&) = delete; + + ThreadPool& operator=(const ThreadPool&) = delete; + + operator bool() const; + + ~ThreadPool(); + + std::future enqueue(std::function task); + void wait(); + +private: + struct Executor; + std::unique_ptr executor_; + size_t num_workers_; +}; + +} // namespace cucim::concurrent + +#endif // CUCIM_CONCURRENT_THREADPOOL_H diff --git a/cpp/include/cucim/cuimage.h b/cpp/include/cucim/cuimage.h index a7a9b22d9..28a9120b7 100644 --- a/cpp/include/cucim/cuimage.h +++ b/cpp/include/cucim/cuimage.h @@ -23,20 +23,28 @@ #include "cucim/filesystem/file_path.h" #include "cucim/io/device.h" #include "cucim/io/format/image_format.h" +#include "cucim/loader/thread_batch_data_loader.h" #include "cucim/memory/dlpack.h" #include "cucim/plugin/image_format.h" #include "cucim/profiler/profiler.h" #include -#include -#include +#include // for std::ptrdiff_t +#include // for std::forward_iterator_tag #include #include +#include +#include #include namespace cucim { +// Forward declarations +class CuImage; +template +class CuImageIterator; + using DetectedFormat = std::pair>; using Metadata = std::string; using Shape = std::vector; @@ -154,11 +162,19 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this ResolutionInfo resolutions() const; + loader::ThreadBatchDataLoader* loader() const; + memory::DLTContainer container() const; CuImage read_region(std::vector&& location, std::vector&& size, uint16_t level = 0, + uint32_t num_workers = 0, + uint32_t batch_size = 1, + bool drop_last = false, + uint32_t prefetch_factor = 2, + bool shuffle = false, + uint64_t seed = 0, const DimIndices& region_dim_indices = {}, const io::Device& device = "cpu", DLTensor* buf = nullptr, @@ -171,6 +187,22 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this void close(); + ///////////////////////////// + // Iterator implementation // + ///////////////////////////// + + using iterator = CuImageIterator; + using const_iterator = CuImageIterator; + + friend class CuImageIterator; + friend class CuImageIterator; + + iterator begin(); + iterator end(); + + const_iterator begin() const; + const_iterator end() const; + private: using Mutex = std::mutex; using ScopedLock = std::scoped_lock; @@ -191,7 +223,7 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this mutable Mutex mutex_; cucim::io::format::ImageFormatDesc* image_format_ = nullptr; - CuCIMFileHandle file_handle_{}; + std::shared_ptr file_handle_; io::format::ImageMetadataDesc* image_metadata_ = nullptr; io::format::ImageDataDesc* image_data_ = nullptr; bool is_loaded_ = false; @@ -199,6 +231,41 @@ class EXPORT_VISIBLE CuImage : public std::enable_shared_from_this std::set associated_images_; }; +template +class EXPORT_VISIBLE CuImageIterator +{ +public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = DataType; + using pointer = value_type*; + using reference = std::shared_ptr; + + CuImageIterator(std::shared_ptr cuimg, bool ending = false); + CuImageIterator(const CuImageIterator& it) = default; + + reference operator*() const; + pointer operator->(); + CuImageIterator& operator++(); + CuImageIterator operator++(int); + bool operator==(const 
CuImageIterator& other); + bool operator!=(const CuImageIterator& other); + + int64_t index(); /// batch index + uint64_t size() const; /// number of batches + +private: + void increase_index_(); + + std::shared_ptr cuimg_; + void* loader_ = nullptr; + int64_t batch_index_ = 0; + uint64_t total_batch_count_ = 0; +}; + +template class CuImageIterator; +template class CuImageIterator; + } // namespace cucim #endif // CUCIM_CUIMAGE_H diff --git a/cpp/include/cucim/filesystem/cufile_driver.h b/cpp/include/cucim/filesystem/cufile_driver.h index f2ec749cd..e1877d5aa 100644 --- a/cpp/include/cucim/filesystem/cufile_driver.h +++ b/cpp/include/cucim/filesystem/cufile_driver.h @@ -190,7 +190,7 @@ class EXPORT_VISIBLE CuFileDriver : public std::enable_shared_from_this handle_; }; } // namespace cucim::filesystem diff --git a/cpp/include/cucim/filesystem/file_handle.h b/cpp/include/cucim/filesystem/file_handle.h index 3bf2c005b..5ecbd56fa 100644 --- a/cpp/include/cucim/filesystem/file_handle.h +++ b/cpp/include/cucim/filesystem/file_handle.h @@ -21,8 +21,16 @@ #include "../macros/defines.h" #include #include +#include + +#include + +#include "cucim/memory/memory_manager.h" typedef void* CUfileHandle_t; +typedef void* CuCIMFileHandle_share; +typedef void* CuCIMFileHandle_ptr; +typedef bool (*CuCIMFileHandleDeleter)(CuCIMFileHandle_ptr); enum class FileHandleType: uint16_t { @@ -35,29 +43,50 @@ enum class FileHandleType: uint16_t #if CUCIM_PLATFORM_LINUX -struct CuCIMFileHandle + +struct EXPORT_VISIBLE CuCIMFileHandle : public std::enable_shared_from_this { -# ifdef __cplusplus - EXPORT_VISIBLE CuCIMFileHandle(); - EXPORT_VISIBLE CuCIMFileHandle(int fd, CUfileHandle_t cufile, FileHandleType type, char* path, void* client_data); - EXPORT_VISIBLE CuCIMFileHandle(int fd, - CUfileHandle_t cufile, - FileHandleType type, - char* path, - void* client_data, - uint64_t dev, - uint64_t ino, - int64_t mtime); -# endif - int fd; - CUfileHandle_t cufile; - FileHandleType type; /// 1: POSIX, 2: POSIX+ODIRECT, 4: MemoryMapped, 8: GPUDirect - char* path; - void* client_data; - uint64_t hash_value; - uint64_t dev; - uint64_t ino; - int64_t mtime; + CuCIMFileHandle(); + CuCIMFileHandle(int fd, CUfileHandle_t cufile, FileHandleType type, char* path, void* client_data); + CuCIMFileHandle(int fd, + CUfileHandle_t cufile, + FileHandleType type, + char* path, + void* client_data, + uint64_t dev, + uint64_t ino, + int64_t mtime); + + ~CuCIMFileHandle() + { + if (path && path[0] != '\0') + { + cucim_free(path); + path = nullptr; + } + + if (deleter) + { + deleter(this); + deleter = nullptr; + } + } + + CuCIMFileHandleDeleter set_deleter(CuCIMFileHandleDeleter deleter) + { + return this->deleter = deleter; + } + + int fd = -1; + CUfileHandle_t cufile = nullptr; + FileHandleType type = FileHandleType::kUnknown; /// 1: POSIX, 2: POSIX+ODIRECT, 4: MemoryMapped, 8: GPUDirect + char* path = nullptr; + void* client_data = nullptr; + uint64_t hash_value = 0; + uint64_t dev = 0; + uint64_t ino = 0; + int64_t mtime = 0; + CuCIMFileHandleDeleter deleter = nullptr; }; #else # error "This platform is not supported!" 
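For illustration, the following is a minimal C++ usage sketch (not part of this patch) showing how the new batching parameters of `read_region()` and the `CuImageIterator` declared above fit together. The file name and coordinates are placeholders, and the result is held in a `std::shared_ptr` because `CuImage` derives from `std::enable_shared_from_this` and `begin()`/`end()` iterate over shared ownership of the region:

    #include "cucim/cuimage.h"

    #include <memory>

    int main()
    {
        cucim::CuImage image("image.tif"); // placeholder input file

        // 'location' is a flattened list of (x, y) pairs, so this requests three
        // patches. num_workers=4 decodes tiles with a 4-thread pool, and
        // batch_size=2 groups the patches into batches of 2 and 1 (drop_last,
        // prefetch_factor, shuffle, and seed keep their defaults).
        auto region = std::make_shared<cucim::CuImage>(image.read_region(
            { 0, 0, 256, 256, 512, 512 }, // patch locations
            { 224, 224 }, // patch size
            0, // level
            4, // num_workers
            2)); // batch_size

        // Iterate over the decoded batches.
        for (auto it = region->begin(); it != region->end(); ++it)
        {
            std::shared_ptr<cucim::CuImage> batch = *it; // one batch per step
            // ... consume batch->container() (a DLPack tensor) ...
        }
        return 0;
    }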
diff --git a/cpp/include/cucim/io/format/image_format.h b/cpp/include/cucim/io/format/image_format.h index 091ea098b..c83f72dc9 100644 --- a/cpp/include/cucim/io/format/image_format.h +++ b/cpp/include/cucim/io/format/image_format.h @@ -160,6 +160,7 @@ struct ImageDataDesc { DLTensor container; char* shm_name; + void* loader; }; struct ImageCheckerDesc @@ -183,7 +184,7 @@ struct ImageParserDesc * @param file_path * @return */ - CuCIMFileHandle(CUCIM_ABI* open)(const char* file_path); + CuCIMFileHandle_share(CUCIM_ABI* open)(const char* file_path); /** * @@ -191,26 +192,36 @@ struct ImageParserDesc * @param out_metadata * @return */ - bool(CUCIM_ABI* parse)(CuCIMFileHandle* handle, ImageMetadataDesc* out_metadata); + bool(CUCIM_ABI* parse)(CuCIMFileHandle_ptr handle, ImageMetadataDesc* out_metadata); /** * * @param handle * @return */ - bool(CUCIM_ABI* close)(CuCIMFileHandle* handle); + bool(CUCIM_ABI* close)(CuCIMFileHandle_ptr handle); }; struct ImageReaderRegionRequestDesc { - int64_t* location; - int64_t* size; - uint16_t level; - DimIndicesDesc region_dim_indices; - char* associated_image_name; - char* device; - DLTensor* buf; - char* shm_name; + int64_t* location = nullptr; + void* location_unique = nullptr; + int64_t* size = nullptr; + void* size_unique = nullptr; + uint64_t location_len = 1; + int32_t size_ndim = 2; + uint16_t level = 0; + uint32_t num_workers = 0; + uint32_t batch_size = 1; + bool drop_last = false; + uint32_t prefetch_factor = 2; + bool shuffle = false; + uint64_t seed = 0; + DimIndicesDesc region_dim_indices{}; + char* associated_image_name = nullptr; + char* device = nullptr; + DLTensor* buf = nullptr; + char* shm_name = nullptr; }; struct ImageReaderDesc @@ -223,7 +234,7 @@ struct ImageReaderDesc * @param out_image_metadata needed for associated_image * @return */ - bool(CUCIM_ABI* read)(const CuCIMFileHandle* handle, + bool(CUCIM_ABI* read)(const CuCIMFileHandle_ptr handle, const ImageMetadataDesc* metadata, const ImageReaderRegionRequestDesc* request, ImageDataDesc* out_image_data, @@ -239,7 +250,7 @@ struct ImageWriterDesc * @param image_data * @return */ - bool(CUCIM_ABI* write)(const CuCIMFileHandle* handle, + bool(CUCIM_ABI* write)(const CuCIMFileHandle_ptr handle, const ImageMetadataDesc* metadata, const ImageDataDesc* image_data); }; diff --git a/cpp/include/cucim/loader/batch_data_processor.h b/cpp/include/cucim/loader/batch_data_processor.h new file mode 100644 index 000000000..7570c772a --- /dev/null +++ b/cpp/include/cucim/loader/batch_data_processor.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUCIM_LOADER_BATCH_DATA_PROCESSOR_H +#define CUCIM_LOADER_BATCH_DATA_PROCESSOR_H + +#include "cucim/macros/api_header.h" + +#include +#include + +#include + +#include "tile_info.h" + +namespace cucim::loader +{ + +class EXPORT_VISIBLE BatchDataProcessor +{ +public: + BatchDataProcessor(uint32_t batch_size); + virtual ~BatchDataProcessor(); + + void add_tile(const TileInfo& tile); + TileInfo remove_front_tile(); + + virtual uint32_t request(std::deque& batch_item_counts, const uint32_t num_remaining_patches); + virtual uint32_t wait_batch(const uint32_t index_in_task, + std::deque& batch_item_counts, + const uint32_t num_remaining_patches); + + virtual std::shared_ptr wait_for_processing(uint32_t); + + virtual void shutdown(); + +protected: + uint32_t batch_size_ = 1; + uint64_t total_index_count_ = 0; + uint64_t processed_index_count_ = 0; + std::deque tiles_; +}; + +} // namespace cucim::loader + +#endif // CUCIM_LOADER_BATCH_DATA_PROCESSOR_H diff --git a/cpp/include/cucim/loader/thread_batch_data_loader.h b/cpp/include/cucim/loader/thread_batch_data_loader.h new file mode 100644 index 000000000..c368184c9 --- /dev/null +++ b/cpp/include/cucim/loader/thread_batch_data_loader.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H +#define CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H + +#include "cucim/macros/api_header.h" + +#include +#include +#include +#include + +#include "cucim/cache/image_cache.h" +#include "cucim/concurrent/threadpool.h" +#include "cucim/io/device.h" +#include "cucim/loader/batch_data_processor.h" +#include "cucim/loader/tile_info.h" + +namespace cucim::loader +{ + +class EXPORT_VISIBLE ThreadBatchDataLoader +{ +public: + using LoadFunc = std::function; + + ThreadBatchDataLoader(LoadFunc load_func, + std::unique_ptr batch_data_processor, + cucim::io::Device out_device, + std::unique_ptr> location, + std::unique_ptr> image_size, + uint64_t location_len, + size_t one_raster_size, + uint32_t batch_size, + uint32_t prefetch_factor, + uint32_t num_workers); + + ~ThreadBatchDataLoader(); + + operator bool() const; + + uint8_t* raster_pointer(uint64_t location_index) const; + uint32_t request(uint32_t load_size = 0); + uint32_t wait_batch(); + /** + * @brief Return the next batch of data. + * + * If the number of workers is zero, this function will return the ownership of the data. + * @return uint8_t* The pointer to the data. 
+ */ + uint8_t* next_data(); + + BatchDataProcessor* batch_data_processor(); + std::shared_ptr wait_for_processing(uint32_t index); + + uint64_t size() const; + uint32_t batch_size() const; + + uint64_t total_batch_count() const; + uint64_t processed_batch_count() const; + uint8_t* data() const; + uint32_t data_batch_size() const; + + bool enqueue(std::function task, const TileInfo& tile); + +private: + bool stopped_ = false; + LoadFunc load_func_; + cucim::io::Device out_device_; + std::unique_ptr> location_ = nullptr; + std::unique_ptr> image_size_ = nullptr; + uint64_t location_len_ = 0; + size_t one_rester_size_ = 0; + uint32_t batch_size_ = 1; + uint32_t prefetch_factor_ = 2; + uint32_t num_workers_ = 0; + + // For nvjpeg + std::unique_ptr batch_data_processor_; + + size_t buffer_item_len_ = 0; + size_t buffer_size_ = 0; + std::vector raster_data_; + std::deque> tasks_; + // NOTE: the order is important ('thread_pool_' depends on 'raster_data_' and 'tasks_') + cucim::concurrent::ThreadPool thread_pool_; + + uint64_t queued_item_count_ = 0; + uint64_t buffer_item_head_index_ = 0; + uint64_t buffer_item_tail_index_ = 0; + + std::deque batch_item_counts_; + uint64_t processed_batch_count_ = 0; + uint8_t* current_data_ = nullptr; + uint32_t current_data_batch_size_ = 0; +}; + +} // namespace cucim::loader + +#endif // CUCIM_LOADER_THREAD_BATCH_DATA_LOADER_H diff --git a/cpp/include/cucim/loader/tile_info.h b/cpp/include/cucim/loader/tile_info.h new file mode 100644 index 000000000..e55bc03a4 --- /dev/null +++ b/cpp/include/cucim/loader/tile_info.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef CUCIM_LOADER_TILE_INFO_H
+#define CUCIM_LOADER_TILE_INFO_H
+
+#include "cucim/macros/api_header.h"
+
+#include <cstdint>
+
+namespace cucim::loader
+{
+
+struct EXPORT_VISIBLE TileInfo
+{
+    int64_t location_index = 0; // patch #
+    int64_t index = 0; // tile #
+    uint64_t offset = 0;
+    uint64_t size = 0;
+};
+
+} // namespace cucim::loader
+
+#endif // CUCIM_LOADER_TILE_INFO_H
diff --git a/cpp/include/cucim/profiler/nvtx3.h b/cpp/include/cucim/profiler/nvtx3.h
index 494bc9320..e438a305a 100644
--- a/cpp/include/cucim/profiler/nvtx3.h
+++ b/cpp/include/cucim/profiler/nvtx3.h
@@ -188,8 +188,10 @@ DEFINE_EVENT(ifd_read, "IFD::read()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_slowpath, "IFD::read::slow_path", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles, "IFD::read_region_tiles()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_iter, "IFD::read_region_tiles::iter", io, 255, 255, 0, 0);
+DEFINE_EVENT(ifd_read_region_tiles_task, "IFD::read_region_tiles::task", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_boundary, "IFD::read_region_tiles_boundary()", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_read_region_tiles_boundary_iter, "IFD::read_region_tiles_boundary::iter", io, 255, 255, 0, 0);
+DEFINE_EVENT(ifd_read_region_tiles_boundary_task, "IFD::read_region_tiles_boundary::task", io, 255, 255, 0, 0);
 DEFINE_EVENT(ifd_decompression, "IFD::decompression", compute, 255, 0, 255, 0);
 
 DEFINE_EVENT(decoder_libjpeg_turbo_tjAlloc, "libjpeg-turbo::tjAlloc()", memory, 255, 63, 72, 204);
diff --git a/cpp/include/cucim/util/cuda.h b/cpp/include/cucim/util/cuda.h
index f89b1adaf..12f161f1e 100644
--- a/cpp/include/cucim/util/cuda.h
+++ b/cpp/include/cucim/util/cuda.h
@@ -17,7 +17,10 @@
 #ifndef CUCIM_UTIL_CUDA_H
 #define CUCIM_UTIL_CUDA_H
 
-#include <cuda_runtime.h>
+
+#if CUCIM_SUPPORT_CUDA
+# include <cuda_runtime.h>
+#endif
 
 #define CUDA_TRY(stmt) \
     { \
@@ -29,6 +32,38 @@
         } \
     }
 
+#define CUDA_ERROR(stmt) \
+    { \
+        cuda_status = stmt; \
+        if (cudaSuccess != cuda_status) \
+        { \
+            throw std::runtime_error( \
+                fmt::format("[Error] CUDA Runtime call {} in line {} of file {} failed with '{}' ({}).\n", #stmt, \
+                            __LINE__, __FILE__, cudaGetErrorString(cuda_status), cuda_status)); \
+        } \
+    }
+
+#define NVJPEG_TRY(stmt) \
+    { \
+        nvjpegStatus_t _nvjpeg_status = stmt; \
+        if (_nvjpeg_status != NVJPEG_STATUS_SUCCESS) \
+        { \
+            fmt::print("[Error] NVJPEG call {} in line {} of file {} failed with the error code {}.\n", #stmt, \
+                       __LINE__, __FILE__, _nvjpeg_status); \
+        } \
+    }
+
+#define NVJPEG_ERROR(stmt) \
+    { \
+        nvjpegStatus_t _nvjpeg_status = stmt; \
+        if (_nvjpeg_status != NVJPEG_STATUS_SUCCESS) \
+        { \
+            throw std::runtime_error( \
+                fmt::format("[Error] NVJPEG call {} in line {} of file {} failed with the error code {}.\n", #stmt, \
+                            __LINE__, __FILE__, _nvjpeg_status)); \
+        } \
+    }
+
 namespace cucim::util
 {
diff --git a/cpp/plugins/cucim.kit.cumed/CMakeLists.txt b/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
index 3917bb571..4073ecfd5 100644
--- a/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
+++ b/cpp/plugins/cucim.kit.cumed/CMakeLists.txt
@@ -70,6 +70,8 @@ endif()
 
 # Find CUDAToolkit as rmm depends on it
 find_package(CUDAToolkit REQUIRED)
+# For Threads::Threads
+find_package(Threads REQUIRED)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED YES)
diff --git a/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp b/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
index ebea041a5..8f0cb5f6e 100644
--- a/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
+++ b/cpp/plugins/cucim.kit.cumed/src/cumed/cumed.cpp
@@ -77,7 +77,7
@@ static bool CUCIM_ABI checker_is_valid(const char* file_name, const char* buf, s return false; } -static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path_) +static CuCIMFileHandle_share CUCIM_ABI parser_open(const char* file_path_) { const cucim::filesystem::Path& file_path = file_path_; @@ -95,13 +95,13 @@ static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path_) throw std::invalid_argument(fmt::format("Cannot open {}!", file_path)); } - // TODO: make file_handle_ object to pointer - auto file_handle = CuCIMFileHandle{ fd, nullptr, FileHandleType::kPosix, file_path_cstr, nullptr }; + auto file_handle = std::make_shared(fd, nullptr, FileHandleType::kPosix, file_path_cstr, nullptr); + CuCIMFileHandle_share handle = new std::shared_ptr(std::move(file_handle)); - return file_handle; + return handle; } -static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) +static bool CUCIM_ABI parser_parse(CuCIMFileHandle_ptr handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) { (void)handle; if (!out_metadata_desc || !out_metadata_desc->handle) @@ -210,13 +210,10 @@ static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::I return true; } -static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) +static bool CUCIM_ABI parser_close(CuCIMFileHandle_ptr handle_ptr) { - if (handle->path) - { - cucim_free(handle->path); - handle->path = nullptr; - } + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); + if (handle->client_data) { // TODO: comment out and reinterpret_cast when needed. @@ -227,12 +224,13 @@ static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) return true; } -static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, +static bool CUCIM_ABI reader_read(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageReaderRegionRequestDesc* request, cucim::io::format::ImageDataDesc* out_image_data, cucim::io::format::ImageMetadataDesc* out_metadata_desc = nullptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; @@ -378,10 +376,11 @@ static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, return true; } -static bool CUCIM_ABI writer_write(const CuCIMFileHandle* handle, +static bool CUCIM_ABI writer_write(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageDataDesc* image_data) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; (void)image_data; diff --git a/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml b/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml index f08604bb6..6d70257c7 100644 --- a/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml +++ b/cpp/plugins/cucim.kit.cuslide/.idea/cucim.kit.cuslide.iml @@ -1,2 +1,2 @@ - + \ No newline at end of file diff --git a/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt b/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt index ce4314677..37dbca4f7 100644 --- a/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt +++ b/cpp/plugins/cucim.kit.cuslide/CMakeLists.txt @@ -70,6 +70,8 @@ endif() # Find CUDAToolkit as rmm depends on it find_package(CUDAToolkit REQUIRED) +# For Threads::Threads +find_package(Threads REQUIRED) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED YES) @@ -123,6 +125,8 @@ superbuild_depend(cli11) superbuild_depend(pugixml) superbuild_depend(json) 
superbuild_depend(libdeflate) +superbuild_depend(nvjpeg) +superbuild_depend(libculibos) ################################################################################ # Find cucim package @@ -160,11 +164,15 @@ add_library(${CUCIM_PLUGIN_NAME} src/cuslide/deflate/deflate.h src/cuslide/jpeg/libjpeg_turbo.cpp src/cuslide/jpeg/libjpeg_turbo.h + src/cuslide/jpeg/libnvjpeg.cpp + src/cuslide/jpeg/libnvjpeg.h src/cuslide/jpeg2k/color_conversion.cpp src/cuslide/jpeg2k/color_conversion.h src/cuslide/jpeg2k/color_table.h src/cuslide/jpeg2k/libopenjpeg.cpp src/cuslide/jpeg2k/libopenjpeg.h + src/cuslide/loader/nvjpeg_processor.cpp + src/cuslide/loader/nvjpeg_processor.h ${deps-libopenjpeg_SOURCE_DIR}/src/bin/common/color.c # for color_sycc_to_rgb() and color_apply_icc_profile() src/cuslide/lzw/lzw.cpp src/cuslide/lzw/lzw.h @@ -208,6 +216,10 @@ target_compile_options(${CUCIM_PLUGIN_NAME} PRIVATE $<$:-W # Link libraries target_link_libraries(${CUCIM_PLUGIN_NAME} PRIVATE + # Use custom nvjpeg_static that supports GPU input (>= CUDA 11.6) + deps::nvjpeg_static # add this before cudart so that nvjpeg.h in static library takes precedence. + # Add CUDA::culibos to link necessary methods for 'deps::nvjpeg_static' + CUDA::culibos # for nvjpeg CUDA::cudart deps::fmt cucim::cucim diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake new file mode 100644 index 000000000..2938198ea --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libculibos.cmake @@ -0,0 +1,42 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This module tries to find libculibos.a from /usr/local/cuda if +# CUDA::culibos is not available through `find_package(CUDAToolkit REQUIRED)`. 
+if (NOT TARGET CUDA::culibos) + + find_package(CUDAToolkit REQUIRED) + + if(NOT TARGET CUDA::culibos) + find_library(CUDA_culibos_LIBRARY + NAMES culibos + HINTS /usr/local/cuda + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ) + + mark_as_advanced(CUDA_culibos_LIBRARY) + + if (NOT TARGET CUDA::culibos AND CUDA_culibos_LIBRARY) + add_library(CUDA::culibos STATIC IMPORTED GLOBAL) + target_include_directories(CUDA::culibos SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + set_property(TARGET CUDA::culibos PROPERTY IMPORTED_LOCATION "${CUDA_culibos_LIBRARY}") + message("Set CUDA_culibos_LIBRARY to '${CUDA_culibos_LIBRARY}'.") + else () + message(FATAL_ERROR "Could not find CUDA::culibos.") + endif() + endif () + +endif () diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake index aea3eb7e4..428b04f0f 100644 --- a/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/libopenjpeg.cmake @@ -38,7 +38,6 @@ if (NOT TARGET deps::libopenjpeg) ########################################################################### # Build liblcms2 with the source in libopenjpeg ########################################################################### - message(" ##GG ${deps-libopenjpeg_SOURCE_DIR}") add_subdirectory(${deps-libopenjpeg_SOURCE_DIR}/thirdparty/liblcms2 ${deps-libopenjpeg_BINARY_DIR}/thirdparty/liblcms2) diff --git a/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake b/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake new file mode 100644 index 000000000..8bcf8c911 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/cmake/deps/nvjpeg.cmake @@ -0,0 +1,41 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if (NOT TARGET deps::nvjpeg_static) + + add_library(deps::nvjpeg_static STATIC IMPORTED GLOBAL) + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/include/nvjpeg.h) + set(NVJPEG_INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/include) + else () + message(FATAL_ERROR "nvjpeg.h not found") + endif () + + message("Set NVJPEG_INCLUDE_PATH to '${NVJPEG_INCLUDE_PATH}'.") + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/lib64/libnvjpeg_static.a) + set(NVJPEG_STATIC_LIB_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../temp/cuda/lib64/libnvjpeg_static.a) + else () + message(FATAL_ERROR "libnvjpeg_static.a not found") + endif () + + message("Set NVJPEG_STATIC_LIB_PATH to '${NVJPEG_STATIC_LIB_PATH}'.") + + set_target_properties(deps::nvjpeg_static PROPERTIES + IMPORTED_LOCATION "${NVJPEG_STATIC_LIB_PATH}" + INTERFACE_INCLUDE_DIRECTORIES "${NVJPEG_INCLUDE_PATH}" + ) + +endif () diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp index 5e7c17d19..42b0dc894 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp @@ -82,15 +82,20 @@ static bool CUCIM_ABI checker_is_valid(const char* file_name, const char* buf, s return false; } -static CuCIMFileHandle CUCIM_ABI parser_open(const char* file_path) +static CuCIMFileHandle_share CUCIM_ABI parser_open(const char* file_path) { auto tif = new cuslide::tiff::TIFF(file_path, O_RDONLY); tif->construct_ifds(); - return tif->file_handle(); + // Move the ownership of the file handle object to the caller (CuImage). + auto handle_t = tif->file_handle(); + tif->file_handle() = nullptr; + CuCIMFileHandle_share handle = new std::shared_ptr(handle_t); + return handle; } -static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::ImageMetadataDesc* out_metadata_desc) +static bool CUCIM_ABI parser_parse(CuCIMFileHandle_ptr handle_ptr, cucim::io::format::ImageMetadataDesc* out_metadata_desc) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); if (!out_metadata_desc || !out_metadata_desc->handle) { throw std::runtime_error("out_metadata_desc shouldn't be nullptr!"); @@ -237,30 +242,34 @@ static bool CUCIM_ABI parser_parse(CuCIMFileHandle* handle, cucim::io::format::I return true; } -static bool CUCIM_ABI parser_close(CuCIMFileHandle* handle) +static bool CUCIM_ABI parser_close(CuCIMFileHandle_ptr handle_ptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); + auto tif = static_cast(handle->client_data); delete tif; handle->client_data = nullptr; return true; } -static bool CUCIM_ABI reader_read(const CuCIMFileHandle* handle, +static bool CUCIM_ABI reader_read(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageReaderRegionRequestDesc* request, cucim::io::format::ImageDataDesc* out_image_data, cucim::io::format::ImageMetadataDesc* out_metadata = nullptr) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); auto tif = static_cast(handle->client_data); bool result = tif->read(metadata, request, out_image_data, out_metadata); return result; } -static bool CUCIM_ABI writer_write(const CuCIMFileHandle* handle, +static bool CUCIM_ABI writer_write(const CuCIMFileHandle_ptr handle_ptr, const cucim::io::format::ImageMetadataDesc* metadata, const cucim::io::format::ImageDataDesc* image_data) { + CuCIMFileHandle* handle = reinterpret_cast(handle_ptr); (void)handle; (void)metadata; (void)image_data; 
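To summarize the opaque-handle convention introduced above (an illustration with hypothetical helper names, not code from this patch): `parser_open()` returns a `CuCIMFileHandle_share` that owns a heap-allocated `std::shared_ptr<CuCIMFileHandle>`, so the caller (CuImage) controls the handle's lifetime, while the other plugin callbacks only borrow the raw object as a `CuCIMFileHandle_ptr`:

    #include <memory>

    struct CuCIMFileHandle; // defined in cucim/filesystem/file_handle.h

    typedef void* CuCIMFileHandle_share; // owns: new std::shared_ptr<CuCIMFileHandle>
    typedef void* CuCIMFileHandle_ptr; // borrows: CuCIMFileHandle*

    // parser_open() style: move ownership into a heap-allocated shared_ptr that
    // the caller takes over, keeping the handle alive while it is referenced.
    CuCIMFileHandle_share wrap_handle(std::shared_ptr<CuCIMFileHandle> handle)
    {
        return new std::shared_ptr<CuCIMFileHandle>(std::move(handle));
    }

    // parser_parse()/parser_close()/reader_read() style: borrow the raw pointer.
    CuCIMFileHandle* unwrap_handle(CuCIMFileHandle_ptr handle_ptr)
    {
        return reinterpret_cast<CuCIMFileHandle*>(handle_ptr);
    }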
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp new file mode 100644 index 000000000..a638215d5 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.cpp @@ -0,0 +1,57 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "libnvjpeg.h" + +#include +#include + +namespace cuslide::jpeg +{ + + +#define THROW(action, message) \ + { \ + printf("ERROR in line %d while %s:\n%s\n", __LINE__, action, message); \ + retval = -1; \ + goto bailout; \ + } + + +bool decode_libnvjpeg(const int fd, + const unsigned char* jpeg_buf, + const uint64_t offset, + const uint64_t size, + const void* jpegtable_data, + const uint32_t jpegtable_count, + uint8_t** dest, + const cucim::io::Device& out_device) +{ + (void)out_device; + (void)fd; + (void)jpeg_buf; + (void)offset; + (void)size; + (void)jpegtable_data; + (void)jpegtable_count; + (void)dest; + (void)out_device; + + return true; +} + +} // namespace cuslide::jpeg diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h new file mode 100644 index 000000000..6457d3a0c --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/jpeg/libnvjpeg.h @@ -0,0 +1,36 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CUSLIDE_LIBNVJPEG_H +#define CUSLIDE_LIBNVJPEG_H + +#include + +namespace cuslide::jpeg +{ + +EXPORT_VISIBLE bool decode_libnvjpeg(int fd, + const unsigned char* jpeg_buf, + uint64_t offset, + uint64_t size, + const void* jpegtable_data, + uint32_t jpegtable_count, + uint8_t** dest, + const cucim::io::Device& out_device); + +} // namespace cuslide::jpeg + +#endif // CUSLIDE_LIBNVJPEG_H diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp new file mode 100644 index 000000000..eccd62549 --- /dev/null +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.cpp @@ -0,0 +1,439 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nvjpeg_processor.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#define ALIGN_UP(x, align_to) (((uint64_t)(x) + ((uint64_t)(align_to)-1)) & ~((uint64_t)(align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((uint64_t)(x) & ~((uint64_t)(align_to)-1))
+namespace cuslide::loader
+{
+
+constexpr uint32_t MAX_CUDA_BATCH_SIZE = 1024;
+
+NvJpegProcessor::NvJpegProcessor(CuCIMFileHandle* file_handle,
+                                 const cuslide::tiff::IFD* ifd,
+                                 const int64_t* request_location,
+                                 const int64_t* request_size,
+                                 const uint64_t location_len,
+                                 const uint32_t batch_size,
+                                 uint32_t maximum_tile_count,
+                                 const uint8_t* jpegtable_data,
+                                 const uint32_t jpegtable_size)
+    : cucim::loader::BatchDataProcessor(batch_size), file_handle_(file_handle), ifd_(ifd)
+{
+    if (maximum_tile_count > 1)
+    {
+        // Calculate the nearest power of 2 that is equal to or larger than the given number.
+        // (Test with https://godbolt.org/z/n7qhPYzfP)
+        int next_candidate = maximum_tile_count & (maximum_tile_count - 1);
+        if (next_candidate > 0)
+        {
+            maximum_tile_count <<= 1;
+            while (true)
+            {
+                next_candidate = maximum_tile_count & (maximum_tile_count - 1);
+                if (next_candidate == 0)
+                {
+                    break;
+                }
+                maximum_tile_count = next_candidate;
+            }
+        }
+
+        // Do not exceed MAX_CUDA_BATCH_SIZE for decoding JPEG with nvJPEG
+        uint32_t cuda_batch_size = std::min(maximum_tile_count, MAX_CUDA_BATCH_SIZE);
+
+        // Update prefetch_factor
+        // (We can decode/cache at least twice the number of tiles needed for batch decoding)
+        // E.g., (128 - 1) / 32 + 1 ~= 4 => 8 (for 256 tiles) for cuda_batch_size(=128) and batch_size(=32)
+        preferred_loader_prefetch_factor_ = ((cuda_batch_size - 1) / batch_size_ + 1) * 2;
+
+        // Create cuda image cache
+        cucim::cache::ImageCacheConfig cache_config{};
+        cache_config.type = cucim::cache::CacheType::kPerProcess;
+        cache_config.memory_capacity = 1024 * 1024; // 1TB: set to fairly large memory so that memory_capacity is not a
+                                                    // limiter.
+ cache_config.capacity = cuda_batch_size * 2; // limit the number of cache item to cuda_batch_size * 2 + cuda_image_cache_ = + std::move(cucim::cache::ImageCacheManager::create_cache(cache_config, cucim::io::DeviceType::kCUDA)); + + cuda_batch_size_ = cuda_batch_size; + + // Initialize nvjpeg + cudaError_t cuda_status; + + if (NVJPEG_STATUS_SUCCESS != nvjpegCreate(backend_, NULL, &handle_)) + { + throw std::runtime_error(fmt::format("NVJPEG initialization error")); + } + if (NVJPEG_STATUS_SUCCESS != nvjpegJpegStateCreate(handle_, &state_)) + { + throw std::runtime_error(fmt::format("JPEG state initialization error")); + } + + nvjpegDecodeBatchedParseJpegTables(handle_, state_, jpegtable_data, jpegtable_size); + nvjpegDecodeBatchedInitialize(handle_, state_, cuda_batch_size_, 1, output_format_); + + CUDA_ERROR(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + + raw_cuda_inputs_.reserve(cuda_batch_size_); + raw_cuda_inputs_len_.reserve(cuda_batch_size_); + + for (uint32_t i = 0; i < cuda_batch_size_; ++i) + { + raw_cuda_outputs_.emplace_back(); // add all-zero nvjpegImage_t object + } + + // Read file block in advance + tile_width_ = ifd->tile_width(); + tile_width_bytes_ = tile_width_ * ifd->pixel_size_nbytes(); + tile_height_ = ifd->tile_height(); + tile_raster_nbytes_ = tile_width_bytes_ * tile_height_; + + struct stat sb; + fstat(file_handle_->fd, &sb); + file_size_ = sb.st_size; + file_start_offset_ = 0; + file_block_size_ = file_size_; + + update_file_block_info(request_location, request_size, location_len); + + constexpr int BLOCK_SECTOR_SIZE = 4096; + switch (backend_) + { + case NVJPEG_BACKEND_GPU_HYBRID: + cufile_ = cucim::filesystem::open(file_handle->path, "rp"); + unaligned_host_ = static_cast(cucim_malloc(file_block_size_ + BLOCK_SECTOR_SIZE * 2)); + aligned_host_ = reinterpret_cast(ALIGN_UP(unaligned_host_, BLOCK_SECTOR_SIZE)); + cufile_->pread(aligned_host_, file_block_size_, file_start_offset_); + break; + case NVJPEG_BACKEND_GPU_HYBRID_DEVICE: + cufile_ = cucim::filesystem::open(file_handle->path, "r"); + CUDA_ERROR(cudaMalloc(&unaligned_device_, file_block_size_ + BLOCK_SECTOR_SIZE)); + aligned_device_ = reinterpret_cast(ALIGN_UP(unaligned_device_, BLOCK_SECTOR_SIZE)); + cufile_->pread(aligned_device_, file_block_size_, file_start_offset_); + break; + default: + throw std::runtime_error("Unsupported backend type"); + } + } +} + +NvJpegProcessor::~NvJpegProcessor() +{ + if (unaligned_host_) + { + cucim_free(unaligned_host_); + unaligned_host_ = nullptr; + } + + cudaError_t cuda_status; + if (unaligned_device_) + { + CUDA_ERROR(cudaFree(unaligned_device_)); + unaligned_device_ = nullptr; + } + + for (uint32_t i = 0; i < cuda_batch_size_; ++i) + { + if (raw_cuda_outputs_[i].channel[0]) + { + CUDA_ERROR(cudaFree(raw_cuda_outputs_[i].channel[0])); + raw_cuda_outputs_[i].channel[0] = nullptr; + } + } + + if (state_) + { + NVJPEG_ERROR(nvjpegJpegStateDestroy(state_)); + state_ = nullptr; + } + if (handle_) + { + NVJPEG_ERROR(nvjpegDestroy(handle_)); + handle_ = nullptr; + } +} + +uint32_t NvJpegProcessor::request(std::deque& batch_item_counts, const uint32_t num_remaining_patches) +{ + (void)batch_item_counts; + std::vector tile_to_request; + if (tiles_.empty()) + { + return 0; + } + + // Return if we need to wait until previous cuda batch is consumed. 
+ auto& first_tile = tiles_.front(); + if (first_tile.location_index <= fetch_after_.location_index) + { + if (first_tile.location_index < fetch_after_.location_index || first_tile.index < fetch_after_.index) + { + return 0; + } + } + + // Set fetch_after_ to the last tile info of previously processed cuda batch + if (!cache_tile_queue_.empty()) + { + fetch_after_ = cache_tile_map_[cache_tile_queue_.back()]; + } + + // Remove previous batch (keep last 'cuda_batch_size_' items) before adding/processing new cuda batch + std::vector removed_tiles; + while (cache_tile_queue_.size() > cuda_batch_size_) + { + uint32_t removed_tile_index = cache_tile_queue_.front(); + auto removed_tile = cache_tile_map_.find(removed_tile_index); + removed_tiles.push_back(removed_tile->second); + cache_tile_queue_.pop_front(); + cache_tile_map_.erase(removed_tile_index); + } + + // Collect candidates + for (auto tile : tiles_) + { + auto index = tile.index; + if (tile_to_request.size() >= cuda_batch_size_) + { + break; + } + if (cache_tile_map_.find(index) == cache_tile_map_.end()) + { + if (tile.size == 0) + { + continue; + } + cache_tile_queue_.emplace_back(index); + cache_tile_map_.emplace(index, tile); + tile_to_request.emplace_back(tile); + } + } + + // Return if we need to wait until more patches are requested + if (tile_to_request.size() < cuda_batch_size_) + { + if (num_remaining_patches > 0) + { + // Restore cache_tile_queue_ and cache_tile_map_ + for (auto& added_tile : tile_to_request) + { + uint32_t added_index = added_tile.index; + cache_tile_queue_.pop_back(); + cache_tile_map_.erase(added_index); + } + for (auto rit = removed_tiles.rbegin(); rit != removed_tiles.rend(); ++rit) + { + uint32_t removed_index = rit->index; + cache_tile_queue_.emplace_front(removed_index); + cache_tile_map_.emplace(removed_index, *rit); + } + return 0; + } + else + { + // Completed, set fetch_after_ to the last tile info. 
+            fetch_after_ = tiles_.back();
+        }
+    }
+
+    uint8_t* file_block_ptr = nullptr;
+    switch (backend_)
+    {
+    case NVJPEG_BACKEND_GPU_HYBRID:
+        file_block_ptr = aligned_host_;
+        break;
+    case NVJPEG_BACKEND_GPU_HYBRID_DEVICE:
+        file_block_ptr = aligned_device_;
+        break;
+    default:
+        throw std::runtime_error("Unsupported backend type");
+    }
+
+    cudaError_t cuda_status;
+
+    // Initialize batch data with the first data
+    if (raw_cuda_inputs_.empty())
+    {
+        for (uint32_t i = 0; i < cuda_batch_size_; ++i)
+        {
+            uint8_t* mem_offset = file_block_ptr + tile_to_request[0].offset - file_start_offset_;
+            raw_cuda_inputs_.push_back((const unsigned char*)mem_offset);
+            raw_cuda_inputs_len_.push_back(tile_to_request[0].size);
+            CUDA_ERROR(cudaMallocPitch(
+                &raw_cuda_outputs_[i].channel[0], &raw_cuda_outputs_[i].pitch[0], tile_width_bytes_, tile_height_));
+        }
+        CUDA_ERROR(cudaStreamSynchronize(stream_));
+    }
+
+    // Set inputs to nvJPEG
+    size_t request_count = tile_to_request.size();
+    for (uint32_t i = 0; i < request_count; ++i)
+    {
+        uint8_t* mem_offset = file_block_ptr + tile_to_request[i].offset - file_start_offset_;
+        raw_cuda_inputs_[i] = mem_offset;
+        raw_cuda_inputs_len_[i] = tile_to_request[i].size;
+    }
+
+    int error_code = nvjpegDecodeBatched(
+        handle_, state_, raw_cuda_inputs_.data(), raw_cuda_inputs_len_.data(), raw_cuda_outputs_.data(), stream_);
+
+    if (NVJPEG_STATUS_SUCCESS != error_code)
+    {
+        throw std::runtime_error(fmt::format("Error in batched decode: {}", error_code));
+    }
+    CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+    // Remove the previous batch (keep the last 'cuda_batch_size_' items) before adding to cuda_image_cache_
+    // TODO: Utilize the removed tiles if the next batch uses them.
+    while (cuda_image_cache_->size() > cuda_batch_size_)
+    {
+        cuda_image_cache_->remove_front();
+    }
+
+    // Add to the image cache
+    for (uint32_t i = 0; i < request_count; ++i)
+    {
+        auto& added_tile = tile_to_request[i];
+
+        uint32_t index = added_tile.index;
+        uint64_t index_hash = cucim::codec::splitmix64(index);
+
+        auto key = cuda_image_cache_->create_key(0, index);
+
+        cuda_image_cache_->lock(index_hash);
+
+        uint8_t* tile_data = static_cast<uint8_t*>(cuda_image_cache_->allocate(tile_raster_nbytes_));
+
+        cudaError_t cuda_status;
+        CUDA_TRY(cudaMemcpy2D(tile_data, tile_width_bytes_, raw_cuda_outputs_[i].channel[0],
+                              raw_cuda_outputs_[i].pitch[0], tile_width_bytes_, tile_height_, cudaMemcpyDeviceToDevice));
+
+        const size_t tile_raster_nbytes = raw_cuda_inputs_len_[i];
+        auto value = cuda_image_cache_->create_value(tile_data, tile_raster_nbytes, cucim::io::DeviceType::kCUDA);
+        cuda_image_cache_->insert(key, value);
+        cuda_image_cache_->unlock(index_hash);
+    }
+
+    ++processed_cuda_batch_count_;
+
+    cuda_batch_cond_.notify_all();
+    return request_count;
+}
+
+uint32_t NvJpegProcessor::wait_batch(const uint32_t index_in_task,
+                                     std::deque<uint32_t>& batch_item_counts,
+                                     const uint32_t num_remaining_patches)
+{
+    // Check if the next (cuda) batch needs to be requested whenever the index in a task is divisible by the cuda batch size.
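+    // (one nvjpegDecodeBatched() call is issued per 'cuda_batch_size_' tile tasks.)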
+    // (each task, which handles one patch, consists of multiple tile-processing steps.)
+    if (index_in_task % cuda_batch_size_ == 0)
+    {
+        request(batch_item_counts, num_remaining_patches);
+    }
+    return 0;
+}
+
+std::shared_ptr<cucim::cache::ImageCacheValue> NvJpegProcessor::wait_for_processing(const uint32_t index)
+{
+    uint64_t index_hash = cucim::codec::splitmix64(index);
+    std::mutex* m = reinterpret_cast<std::mutex*>(cuda_image_cache_->mutex(index_hash));
+    std::shared_ptr<cucim::cache::ImageCacheValue> value;
+
+    std::unique_lock<std::mutex> lock(*m);
+    cuda_batch_cond_.wait(lock, [this, index, &value] {
+        // Stop waiting if the thread needs to stop or the cache value is available.
+        if (stopped_)
+        {
+            value = std::shared_ptr<cucim::cache::ImageCacheValue>();
+            return true;
+        }
+        auto key = cuda_image_cache_->create_key(0, index);
+        value = cuda_image_cache_->find(key);
+        return static_cast<bool>(value);
+    });
+    return value;
+}
+
+void NvJpegProcessor::shutdown()
+{
+    stopped_ = true;
+    cuda_batch_cond_.notify_all();
+}
+
+uint32_t NvJpegProcessor::preferred_loader_prefetch_factor()
+{
+    return preferred_loader_prefetch_factor_;
+}
+
+void NvJpegProcessor::update_file_block_info(const int64_t* request_location,
+                                             const int64_t* request_size,
+                                             const uint64_t location_len)
+{
+
+    uint32_t width = ifd_->width();
+    uint32_t height = ifd_->height();
+    uint32_t stride_y = width / tile_width_ + !!(width % tile_width_); // # of tiles in a row(y) in the ifd tile array
+                                                                       // as a grid
+    uint32_t stride_x = height / tile_height_ + !!(height % tile_height_); // # of tiles in a column(x) in the ifd tile
+                                                                           // array as a grid
+    int64_t min_tile_index = 1000000000;
+    int64_t max_tile_index = 0;
+
+    // Assume that tile offsets are increasing as the tile index increases.
+    for (size_t loc_index = 0; loc_index < location_len; ++loc_index)
+    {
+        int64_t sx = request_location[loc_index * 2];
+        int64_t sy = request_location[loc_index * 2 + 1];
+        int64_t offset_sx = static_cast<uint64_t>(sx) / tile_width_; // x-axis start offset for the requested region in
+                                                                     // the ifd tile array as a grid
+        int64_t offset_sy = static_cast<uint64_t>(sy) / tile_height_; // y-axis start offset for the requested region
+                                                                      // in the ifd tile array as a grid
+        int64_t tile_index = (offset_sy * stride_y) + offset_sx;
+        min_tile_index = std::min(min_tile_index, tile_index);
+        max_tile_index = std::max(max_tile_index, tile_index);
+    }
+
+    int64_t w = request_size[0];
+    int64_t h = request_size[1];
+    int64_t additional_index_x = (static_cast<uint64_t>(w) + (tile_width_ - 1)) / tile_width_;
+    int64_t additional_index_y = (static_cast<uint64_t>(h) + (tile_height_ - 1)) / tile_height_;
+    min_tile_index = std::max(min_tile_index, 0L);
+    max_tile_index =
+        std::min(stride_x * stride_y - 1,
+                 static_cast<uint32_t>(max_tile_index + (additional_index_y * stride_y) + additional_index_x));
+
+    auto& image_piece_offsets = const_cast<std::vector<uint64_t>&>(ifd_->image_piece_offsets());
+    auto& image_piece_bytecounts = const_cast<std::vector<uint64_t>&>(ifd_->image_piece_bytecounts());
+
+    uint64_t min_offset = image_piece_offsets[min_tile_index];
+    uint64_t max_offset = image_piece_offsets[max_tile_index] + image_piece_bytecounts[max_tile_index];
+
+    file_start_offset_ = min_offset;
+    file_block_size_ = max_offset - min_offset + 1;
+}
+
+} // namespace cuslide::loader
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h
new file mode 100644
index 000000000..221adf4c7
--- /dev/null
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/loader/nvjpeg_processor.h
@@ -0,0 +1,110 @@
+/*
+ * Apache License, Version 2.0
+ * Copyright 2021 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef CUSLIDE_NVJPEG_PROCESSOR_H
+#define CUSLIDE_NVJPEG_PROCESSOR_H
+
+
+#include <condition_variable>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include <nvjpeg.h>
+
+#include <cucim/cache/image_cache.h>
+#include <cucim/filesystem/cufile_driver.h>
+#include <cucim/filesystem/file_handle.h>
+#include <cucim/loader/batch_data_processor.h>
+#include <cucim/loader/tile_info.h>
+
+#include "cuslide/tiff/ifd.h"
+
+namespace cuslide::loader
+{
+
+class NvJpegProcessor : public cucim::loader::BatchDataProcessor
+{
+public:
+    NvJpegProcessor(CuCIMFileHandle* file_handle,
+                    const cuslide::tiff::IFD* ifd,
+                    const int64_t* request_location,
+                    const int64_t* request_size,
+                    uint64_t location_len,
+                    uint32_t batch_size,
+                    uint32_t maximum_tile_count,
+                    const uint8_t* jpegtable_data,
+                    uint32_t jpegtable_size);
+    ~NvJpegProcessor();
+
+    uint32_t request(std::deque<uint32_t>& batch_item_counts, uint32_t num_remaining_patches) override;
+    uint32_t wait_batch(uint32_t index_in_task,
+                        std::deque<uint32_t>& batch_item_counts,
+                        uint32_t num_remaining_patches) override;
+
+    std::shared_ptr<cucim::cache::ImageCacheValue> wait_for_processing(uint32_t index) override;
+
+    void shutdown() override;
+
+    uint32_t preferred_loader_prefetch_factor();
+
+private:
+    void update_file_block_info(const int64_t* request_location, const int64_t* request_size, uint64_t location_len);
+
+    bool stopped_ = false;
+    uint32_t preferred_loader_prefetch_factor_ = 2;
+
+    CuCIMFileHandle* file_handle_ = nullptr;
+    const cuslide::tiff::IFD* ifd_ = nullptr;
+    std::shared_ptr<cucim::filesystem::CuFileDriver> cufile_;
+    size_t tile_width_ = 0;
+    size_t tile_width_bytes_ = 0;
+    size_t tile_height_ = 0;
+    size_t tile_raster_nbytes_ = 0;
+    size_t file_size_ = 0;
+    size_t file_start_offset_ = 0;
+    size_t file_block_size_ = 0;
+
+    uint32_t cuda_batch_size_ = 1;
+    nvjpegHandle_t handle_ = nullptr;
+    nvjpegOutputFormat_t output_format_ = NVJPEG_OUTPUT_RGBI;
+    nvjpegJpegState_t state_;
+    nvjpegBackend_t backend_ = NVJPEG_BACKEND_GPU_HYBRID_DEVICE;
+    cudaStream_t stream_ = nullptr;
+
+    std::condition_variable cuda_batch_cond_;
+    std::unique_ptr<cucim::cache::ImageCache> cuda_image_cache_;
+    uint64_t processed_cuda_batch_count_ = 0;
+    cucim::loader::TileInfo fetch_after_{ -1, -1, 0, 0 };
+
+    std::deque<uint32_t> cache_tile_queue_;
+    std::unordered_map<uint32_t, cucim::loader::TileInfo> cache_tile_map_;
+
+    uint8_t* unaligned_host_ = nullptr;
+    uint8_t* aligned_host_ = nullptr;
+    uint8_t* unaligned_device_ = nullptr;
+    uint8_t* aligned_device_ = nullptr;
+
+    std::vector<const unsigned char*> raw_cuda_inputs_;
+    std::vector<size_t> raw_cuda_inputs_len_;
+    std::vector<nvjpegImage_t> raw_cuda_outputs_;
+};
+
+} // namespace cuslide::loader
+
+#endif // CUSLIDE_NVJPEG_PROCESSOR_H
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
index b4e59b6c0..9082a98ee 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
@@ -19,7 +19,9 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -32,10 +34,12 @@
 #include
 #include
 #include
+#include
 #include "cuslide/deflate/deflate.h"
 #include "cuslide/jpeg/libjpeg_turbo.h"
 #include "cuslide/jpeg2k/libopenjpeg.h"
+#include "cuslide/loader/nvjpeg_processor.h"
 #include "cuslide/lzw/lzw.h"
 #include
"cuslide/raw/raw.h" #include "tiff.h" @@ -48,17 +52,16 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind { PROF_SCOPED_RANGE(PROF_EVENT(ifd_ifd)); auto tif = tiff->client(); - int ret; char* software_char_ptr = nullptr; char* model_char_ptr = nullptr; - // TODO: error handling - ret = TIFFGetField(tif, TIFFTAG_SOFTWARE, &software_char_ptr); + // TODO: error handling + TIFFGetField(tif, TIFFTAG_SOFTWARE, &software_char_ptr); software_ = std::string(software_char_ptr ? software_char_ptr : ""); - ret = TIFFGetField(tif, TIFFTAG_MODEL, &model_char_ptr); + TIFFGetField(tif, TIFFTAG_MODEL, &model_char_ptr); model_ = std::string(model_char_ptr ? model_char_ptr : ""); - ret = TIFFGetField(tif, TIFFTAG_IMAGEDESCRIPTION, &model_char_ptr); + TIFFGetField(tif, TIFFTAG_IMAGEDESCRIPTION, &model_char_ptr); image_description_ = std::string(model_char_ptr ? model_char_ptr : ""); TIFFDirectory& tif_dir = tif->tif_dir; @@ -82,26 +85,8 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind photometric_ = tif_dir.td_photometric; compression_ = tif_dir.td_compression; TIFFGetField(tif, TIFFTAG_PREDICTOR, &predictor_); - - // ret = TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width_); - // ret = TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height_); - // ret = TIFFGetField(tif, TIFFTAG_TILEWIDTH, &tile_width_); - // ret = TIFFGetField(tif, TIFFTAG_TILELENGTH, &tile_height_); - // ret = TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bits_per_sample_); - // ret = TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel_); - // ret = TIFFGetField(tif, TIFFTAG_SUBFILETYPE, &subfile_type_); // for checking if FILETYPE_REDUCEDIMAGE - // ret = TIFFGetField(tif, TIFFTAG_PLANARCONFIG, &planar_config_); - // ret = TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric_); - // ret = TIFFGetField(tif, TIFFTAG_COMPRESSION, &compression_); - // printf("[GB] offset_entry:%lu %p\n", tif->tif_dir.td_stripoffset_entry.tdir_count, - // tif->tif_dir.td_stripoffset_p); printf("[GB] width: %d %d\n", tif->tif_dir.td_imagewidth, width_); - // printf("[GB] bytecount entry2:%lu %p\n", tif->tif_dir.td_stripbytecount_entry.tdir_count, - // tif->tif_dir.td_stripbytecount_p); - (void)ret; - subifd_count_ = tif_dir.td_nsubifd; uint64_t* subifd_offsets = tif_dir.td_subifd; - // ret = TIFFGetField(tif, TIFFTAG_SUBIFD, &subifd_count, &subifd_offsets); if (subifd_count_) { subifd_offsets_.resize(subifd_count_); @@ -113,7 +98,7 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind uint8_t* jpegtable_data = nullptr; uint32_t jpegtable_count = 0; - ret = TIFFGetField(tif, TIFFTAG_JPEGTABLES, &jpegtable_count, &jpegtable_data); + TIFFGetField(tif, TIFFTAG_JPEGTABLES, &jpegtable_count, &jpegtable_data); jpegtable_.reserve(jpegtable_count); jpegtable_.insert(jpegtable_.end(), jpegtable_data, jpegtable_data + jpegtable_count); @@ -139,7 +124,7 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind image_piece_bytecounts_.end(), &td_stripbytecount_p[0], &td_stripbytecount_p[image_piece_count_]); // Calculate hash value with IFD index - hash_value_ = tiff->file_handle_.hash_value ^ cucim::codec::splitmix64(index); + hash_value_ = tiff->file_handle_->hash_value ^ cucim::codec::splitmix64(index); // TIFFPrintDirectory(tif, stdout, TIFFPRINT_STRIPS); } @@ -164,12 +149,16 @@ bool IFD::read(const TIFF* tiff, int64_t sx = request->location[0]; int64_t sy = request->location[1]; + uint32_t batch_size = request->batch_size; int64_t w = request->size[0]; 
 int64_t h = request->size[1];
 int32_t n_ch = samples_per_pixel_; // number of channels
+    int ndim = 3;
 size_t raster_size = w * h * samples_per_pixel_;
 void* raster = nullptr;
+    auto raster_type = cucim::io::DeviceType::kCPU;
+
 DLTensor* out_buf = request->buf;
 bool is_buf_available = out_buf && out_buf->data;
@@ -181,16 +170,143 @@
 if (is_read_optimizable())
 {
-        if (!raster)
+        if (batch_size > 1)
+        {
+            ndim = 4;
+        }
+        int64_t* location = request->location;
+        uint64_t location_len = request->location_len;
+        const uint32_t num_workers = request->num_workers;
+        const bool drop_last = request->drop_last;
+        uint32_t prefetch_factor = request->prefetch_factor;
+        const bool shuffle = request->shuffle;
+        const uint64_t seed = request->seed;
+
+        if (num_workers == 0 && location_len > 1)
         {
-            raster = cucim_malloc(raster_size); // RGB image
-            memset(raster, 0, raster_size);
+            throw std::runtime_error("Cannot read multiple locations with zero workers!");
         }
+        // Shuffle data
+        if (shuffle)
+        {
+            auto rng = std::default_random_engine{ seed };
+            struct position
+            {
+                int64_t x;
+                int64_t y;
+            };
+            std::shuffle(reinterpret_cast<position*>(&location[0]),
+                         reinterpret_cast<position*>(&location[location_len * 2]), rng);
+        }
+
+        // Adjust the location length based on 'drop_last'
+        const uint32_t remaining_len = location_len % batch_size;
+        if (drop_last)
+        {
+            location_len -= remaining_len;
+        }
 
-        if (!read_region_tiles(tiff, this, sx, sy, w, h, raster, out_device))
+        // Do not use prefetch if the image is too small
+        if (1 + prefetch_factor > location_len)
         {
-            fmt::print(stderr, "[Error] Failed to read region with libjpeg!\n");
+            prefetch_factor = location_len - 1;
+        }
+
+        size_t one_raster_size = raster_size;
+        raster_size *= batch_size;
+
+        const IFD* ifd = this;
+
+        if (location_len > 1 || batch_size > 1 || num_workers > 0)
+        {
+            // Reconstruct location
+            std::unique_ptr<std::vector<int64_t>>* location_unique =
+                reinterpret_cast<std::unique_ptr<std::vector<int64_t>>*>(request->location_unique);
+            std::unique_ptr<std::vector<int64_t>> request_location = std::move(*location_unique);
+            delete location_unique;
+
+            // Reconstruct size
+            std::unique_ptr<std::vector<int64_t>>* size_unique =
+                reinterpret_cast<std::unique_ptr<std::vector<int64_t>>*>(request->size_unique);
+            std::unique_ptr<std::vector<int64_t>> request_size = std::move(*size_unique);
+            delete size_unique;
+
+            auto load_func = [tiff, ifd, location, w, h, out_device](
+                                 cucim::loader::ThreadBatchDataLoader* loader_ptr, uint64_t location_index) {
+                uint8_t* raster_ptr = loader_ptr->raster_pointer(location_index);
+
+                if (!read_region_tiles(tiff, ifd, location, location_index, w, h,
+                                       raster_ptr, out_device, loader_ptr))
+                {
+                    fmt::print(stderr, "[Error] Failed to read region!\n");
+                }
+            };
+
+            uint32_t maximum_tile_count = 0;
+
+            std::unique_ptr<cucim::loader::BatchDataProcessor> batch_processor;
+
+            // Set raster_type to CUDA because the loader will handle this with nvjpeg
+            if (out_device.type() == cucim::io::DeviceType::kCUDA)
+            {
+                raster_type = cucim::io::DeviceType::kCUDA;
+
+                // The maximal number of tiles (x-axis) overlapped with the given patch
+                uint32_t tile_across_count = std::min(static_cast<uint64_t>(ifd->width_) + (ifd->tile_width_ - 1),
+                                                      static_cast<uint64_t>(w) + (ifd->tile_width_ - 1)) /
+                                                 ifd->tile_width_ +
+                                             1;
+                // The maximal number of tiles (y-axis) overlapped with the given patch
+                uint32_t tile_down_count = std::min(static_cast<uint64_t>(ifd->height_) + (ifd->tile_height_ - 1),
+                                                    static_cast<uint64_t>(h) + (ifd->tile_height_ - 1)) /
+                                               ifd->tile_height_ +
+                                           1;
+                // The maximal number of possible tiles (# of tasks) to load for the given image batch
+                maximum_tile_count = tile_across_count * tile_down_count * batch_size;
+
+                // Create NvJpegProcessor
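+                // (the processor owns a small CUDA image cache, sized to two cuda batches, which holds
+                // tiles decoded by nvjpegDecodeBatched() until the worker tasks copy them out.)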
+                auto& jpegtable = ifd->jpegtable_;
+                const void* jpegtable_data = jpegtable.data();
+                uint32_t jpegtable_size = jpegtable.size();
+
+                auto nvjpeg_processor = std::make_unique<cuslide::loader::NvJpegProcessor>(
+                    tiff->file_handle_, ifd, request_location->data(), request_size->data(), location_len, batch_size,
+                    maximum_tile_count, static_cast<const uint8_t*>(jpegtable_data), jpegtable_size);
+
+                // Update prefetch_factor
+                prefetch_factor = nvjpeg_processor->preferred_loader_prefetch_factor();
+
+                batch_processor = std::move(nvjpeg_processor);
+            }
+
+            auto loader = std::make_unique<cucim::loader::ThreadBatchDataLoader>(
+                load_func, std::move(batch_processor), out_device, std::move(request_location), std::move(request_size),
+                location_len, one_raster_size, batch_size, prefetch_factor, num_workers);
+
+            const uint32_t load_size = std::min(static_cast<uint64_t>(batch_size) * (1 + prefetch_factor), location_len);
+
+            loader->request(load_size);
+
+            // If it reads the entire image with multiple threads (using the loader), fetch the next item.
+            if (location_len == 1 && batch_size == 1)
+            {
+                raster = loader->next_data();
+            }
+
+            out_image_data->loader = loader.release(); // set the loader on out_image_data
+        }
+        else
+        {
+            if (!raster)
+            {
+                raster = cucim_malloc(one_raster_size);
+            }
+
+            if (!read_region_tiles(tiff, ifd, location, 0, w, h, raster, out_device, nullptr))
+            {
+                fmt::print(stderr, "[Error] Failed to read region!\n");
+            }
+        }
     }
     else
@@ -260,14 +376,23 @@
         }
     }
 
-    int ndim = 3;
     int64_t* shape = static_cast<int64_t*>(cucim_malloc(sizeof(int64_t) * ndim));
-    shape[0] = h;
-    shape[1] = w;
-    shape[2] = n_ch;
+    if (ndim == 3)
+    {
+        shape[0] = h;
+        shape[1] = w;
+        shape[2] = n_ch;
+    }
+    else // ndim == 4
+    {
+        shape[0] = batch_size;
+        shape[1] = h;
+        shape[2] = w;
+        shape[3] = n_ch;
+    }
 
     // Copy the raster memory and free it if needed.
-    if (!is_buf_available)
+    if (!is_buf_available && raster && raster_type == cucim::io::DeviceType::kCPU)
     {
         cucim::memory::move_raster_from_host(&raster, raster_size, out_device);
     }
@@ -386,6 +511,19 @@ const std::vector<uint64_t>& IFD::image_piece_bytecounts() const
     return image_piece_bytecounts_;
 }
 
+size_t IFD::pixel_size_nbytes() const
+{
+    const int pixel_format = TJPF_RGB; // TODO: support other pixel formats
+    const int nbytes = tjPixelSize[pixel_format];
+    return nbytes;
+}
+
+size_t IFD::tile_raster_size_nbytes() const
+{
+    const size_t nbytes = tile_width_ * tile_height_ * pixel_size_nbytes();
+    return nbytes;
+}
+
 bool IFD::is_compression_supported() const
 {
     switch (compression_)
@@ -419,16 +557,19 @@ bool IFD::is_format_supported() const
 
 bool IFD::read_region_tiles(const TIFF* tiff,
                             const IFD* ifd,
-                            const int64_t sx,
-                            const int64_t sy,
+                            const int64_t* location,
+                            const int64_t location_index,
                             const int64_t w,
                             const int64_t h,
                             void* raster,
-                            const cucim::io::Device& out_device)
+                            const cucim::io::Device& out_device,
+                            cucim::loader::ThreadBatchDataLoader* loader)
 {
     PROF_SCOPED_RANGE(PROF_EVENT(ifd_read_region_tiles));
     // Reference code: https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/tjexample.c
+    int64_t sx = location[location_index * 2];
+    int64_t sy = location[location_index * 2 + 1];
     int64_t ex = sx + w - 1;
     int64_t ey = sy + h - 1;
 
@@ -438,7 +579,7 @@ bool IFD::read_region_tiles(const TIFF* tiff,
     // Handle out-of-boundary case
     if (sx < 0 || sy < 0 || sx >= width || sy >= height || ex < 0 || ey < 0 || ex >= width || ey >= height)
     {
-        return read_region_tiles_boundary(tiff, ifd, sx, sy, w, h, raster, out_device);
+        return read_region_tiles_boundary(tiff, ifd, location, location_index, w, h, raster, out_device, loader);
     }
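+    // From this point on, the requested region lies entirely within the image bounds.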
cucim::cache::ImageCache& image_cache = cucim::CuImage::cache_manager().cache(); cucim::cache::CacheType cache_type = image_cache.type(); @@ -476,19 +617,9 @@ bool IFD::read_region_tiles(const TIFF* tiff, uint32_t start_index_y = offset_sy * stride_y; uint32_t end_index_y = offset_ey * stride_y; - // Memory for tile_raster would be manually allocated here, instead of using decode_libjpeg(). - // Need to free the manually. Usually it is set to nullptr and memory is created by decode_libjpeg() by using - // tjAlloc() (Also need to free with tjFree() after use. See the documentation of tjAlloc() for the detail.) - const int pixel_format = TJPF_RGB; // TODO: support other pixel format - const int pixel_size_nbytes = tjPixelSize[pixel_format]; - const size_t tile_raster_nbytes = tw * th * pixel_size_nbytes; - uint8_t* tile_raster = nullptr; - if (cache_type == cucim::cache::CacheType::kNoCache) - { - tile_raster = static_cast(cucim_malloc(tile_raster_nbytes)); - } + const size_t tile_raster_nbytes = ifd->tile_raster_size_nbytes(); - int tiff_file = tiff->file_handle_.fd; + int tiff_file = tiff->file_handle_->fd; uint64_t ifd_hash_value = ifd->hash_value_; uint32_t dest_pixel_step_y = w * samples_per_pixel; @@ -506,132 +637,187 @@ bool IFD::read_region_tiles(const TIFF* tiff, uint32_t dest_pixel_index_x = 0; uint32_t index = index_y + offset_sx; - // Calculate a simple hash value for the tile index - uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); for (uint32_t offset_x = offset_sx; offset_x <= offset_ex; ++offset_x, ++index) { PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_iter, index)); auto tiledata_offset = static_cast(ifd->image_piece_offsets_[index]); auto tiledata_size = static_cast(ifd->image_piece_bytecounts_[index]); + // Calculate a simple hash value for the tile index + uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); + uint32_t tile_pixel_offset_x = (offset_x == offset_sx) ? pixel_offset_sx : 0; uint32_t nbytes_tile_pixel_size_x = (offset_x == offset_ex) ? (pixel_offset_ex - tile_pixel_offset_x + 1) * samples_per_pixel : (tw - tile_pixel_offset_x) * samples_per_pixel; - - uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; - uint32_t dest_pixel_index = dest_pixel_index_x; - - uint8_t* tile_data = tile_raster; - - if (tiledata_size > 0) - { - auto key = image_cache.create_key(ifd_hash_value, index); - image_cache.lock(index_hash); - auto value = image_cache.find(key); - if (value) + auto decode_func = [=, &image_cache]() { + PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_task, index_hash)); + uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; + uint32_t dest_pixel_index = dest_pixel_index_x; + uint8_t* tile_data = nullptr; + if (tiledata_size > 0) { - image_cache.unlock(index_hash); - tile_data = static_cast(value->data); - } - else - { - // Lifetime of tile_data is same with `value` - // : do not access this data when `value` is not accessible. 
- if (cache_type != cucim::cache::CacheType::kNoCache) - { - tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); - } + std::unique_ptr tile_raster = + std::unique_ptr(nullptr, cucim_free); + if (loader && loader->batch_data_processor()) { - PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); switch (compression_method) { - case COMPRESSION_NONE: - cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - break; case COMPRESSION_JPEG: - cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - jpegtable_data, jpegtable_count, &tile_data, out_device, - jpeg_color_space); - break; - case COMPRESSION_ADOBE_DEFLATE: - case COMPRESSION_DEFLATE: - cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device); - break; - case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kSYCC); - break; - case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kRGB); - break; - case COMPRESSION_LZW: - cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - // Apply unpredictor - // 1: none, 2: horizontal differencing, 3: floating point predictor - // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf - if (predictor == 2) - { - cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); - } break; default: throw std::runtime_error("Unsupported compression method"); } + auto value = loader->wait_for_processing(index); + if (!value) // if shutdown + { + return; + } + tile_data = static_cast(value->data); + + cudaError_t cuda_status; + CUDA_ERROR(cudaMemcpy2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + tile_data + nbytes_tile_index, nbytes_tw, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1, + cudaMemcpyDeviceToDevice)); } + else + { + auto key = image_cache.create_key(ifd_hash_value, index); + image_cache.lock(index_hash); + auto value = image_cache.find(key); + if (value) + { + image_cache.unlock(index_hash); + tile_data = static_cast(value->data); + } + else + { + // Lifetime of tile_data is same with `value` + // : do not access this data when `value` is not accessible. 
+ if (cache_type != cucim::cache::CacheType::kNoCache) + { + tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); + } + else + { + // Allocate temporary buffer for tile data + tile_raster = std::unique_ptr( + reinterpret_cast(cucim_malloc(tile_raster_nbytes)), cucim_free); + tile_data = tile_raster.get(); + } + { + PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); + switch (compression_method) + { + case COMPRESSION_NONE: + cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case COMPRESSION_JPEG: + cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, + jpegtable_data, jpegtable_count, &tile_data, + out_device, jpeg_color_space); + break; + case COMPRESSION_ADOBE_DEFLATE: + case COMPRESSION_DEFLATE: + cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kSYCC); + break; + case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kRGB); + break; + case COMPRESSION_LZW: + cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + // Apply unpredictor + // 1: none, 2: horizontal differencing, 3: floating point predictor + // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf + if (predictor == 2) + { + cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); + } + break; + default: + throw std::runtime_error("Unsupported compression method"); + } + } - value = image_cache.create_value(tile_data, tile_raster_nbytes); - image_cache.insert(key, value); - image_cache.unlock(index_hash); - } + value = image_cache.create_value(tile_data, tile_raster_nbytes); + image_cache.insert(key, value); + image_cache.unlock(index_hash); + } - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + nbytes_tile_pixel_size_x); + } + } + } + else { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, nbytes_tile_pixel_size_x); + if (out_device.type() == cucim::io::DeviceType::kCPU) + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + // Set background value such as (255,255,255) + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + cudaError_t cuda_status; + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, background_value, + nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1)); + } } + }; + + if (loader && *loader) + { + loader->enqueue(std::move(decode_func), + cucim::loader::TileInfo{ location_index, index, tiledata_offset, tiledata_size }); } else { - for (uint32_t ty = 
tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) - { - // Set (255,255,255) - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); - } + decode_func(); } + dest_pixel_index_x += nbytes_tile_pixel_size_x; } dest_start_ptr += dest_pixel_step_y * dest_pixel_offset_len_y; } - if (tile_raster) - { - cucim_free(tile_raster); - } - return true; } bool IFD::read_region_tiles_boundary(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device) + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader) { PROF_SCOPED_RANGE(PROF_EVENT(ifd_read_region_tiles_boundary)); (void)out_device; // Reference code: https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/tjexample.c + int64_t sx = location[location_index * 2]; + int64_t sy = location[location_index * 2 + 1]; uint8_t background_value = tiff->background_value_; uint16_t compression_method = ifd->compression_; @@ -647,14 +833,13 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, // Memory for tile_raster would be manually allocated here, instead of using decode_libjpeg(). // Need to free the manually. Usually it is set to nullptr and memory is created by decode_libjpeg() by using // tjAlloc() (Also need to free with tjFree() after use. See the documentation of tjAlloc() for the detail.) - const int pixel_format = TJPF_RGB; // TODO: support other pixel format - const int pixel_size_nbytes = tjPixelSize[pixel_format]; + const int pixel_size_nbytes = ifd->pixel_size_nbytes(); auto dest_start_ptr = static_cast(raster); bool is_out_of_image = (ex < 0 || width <= sx || ey < 0 || height <= sy); if (is_out_of_image) { - // Fill (255,255,255) and return + // Fill background color(255,255,255) and return memset(dest_start_ptr, background_value, w * h * pixel_size_nbytes); return true; } @@ -665,11 +850,6 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, uint32_t th = ifd->tile_height_; const size_t tile_raster_nbytes = tw * th * pixel_size_nbytes; - uint8_t* tile_raster = nullptr; - if (cache_type == cucim::cache::CacheType::kNoCache) - { - tile_raster = static_cast(cucim_malloc(tile_raster_nbytes)); - } // TODO: revert this once we can get RGB data instead of RGBA uint32_t samples_per_pixel = 3; // ifd->samples_per_pixel(); @@ -737,7 +917,7 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, int64_t boundary_index_y = offset_boundary_y * stride_y; - int tiff_file = tiff->file_handle_.fd; + int tiff_file = tiff->file_handle_->fd; uint64_t ifd_hash_value = ifd->hash_value_; uint32_t dest_pixel_step_y = w * samples_per_pixel; @@ -756,13 +936,15 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, uint32_t dest_pixel_index_x = 0; int64_t index = index_y + offset_sx; - // Calculate a simple hash value for the tile index - uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); for (int64_t offset_x = offset_sx; offset_x <= offset_ex; ++offset_x, ++index) { PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_boundary_iter, index)); uint64_t tiledata_offset = 0; uint64_t tiledata_size = 0; + + // Calculate a simple hash value for the tile index + uint64_t index_hash = ifd_hash_value ^ (static_cast(index) | (static_cast(index) << 32)); + if (offset_x >= offset_min_x && offset_x <= offset_max_x && index_y >= 
start_index_min_y && index_y <= end_index_max_y) { @@ -775,166 +957,257 @@ bool IFD::read_region_tiles_boundary(const TIFF* tiff, (pixel_offset_ex - tile_pixel_offset_x + 1) * samples_per_pixel : (tw - tile_pixel_offset_x) * samples_per_pixel; - uint32_t nbytes_tile_index = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; - uint32_t dest_pixel_index = dest_pixel_index_x; - if (tiledata_size > 0) - { - bool copy_partial = false; - uint32_t fixed_nbytes_tile_pixel_size_x = nbytes_tile_pixel_size_x; - uint32_t fixed_tile_pixel_offset_ey = tile_pixel_offset_ey; + uint32_t nbytes_tile_index_orig = (tile_pixel_offset_sy * tw + tile_pixel_offset_x) * samples_per_pixel; + uint32_t dest_pixel_index_orig = dest_pixel_index_x; - if (offset_x == offset_boundary_x) - { - copy_partial = true; - if (offset_x != offset_ex) - { - fixed_nbytes_tile_pixel_size_x = - (pixel_offset_boundary_x - tile_pixel_offset_x + 1) * samples_per_pixel; - } - else - { - fixed_nbytes_tile_pixel_size_x = - (std::min(pixel_offset_boundary_x, pixel_offset_ex) - tile_pixel_offset_x + 1) * - samples_per_pixel; - } - } - if (index_y == boundary_index_y) + auto decode_func = [=, &image_cache]() { + PROF_SCOPED_RANGE(PROF_EVENT_P(ifd_read_region_tiles_boundary_task, index_hash)); + uint32_t nbytes_tile_index = nbytes_tile_index_orig; + uint32_t dest_pixel_index = dest_pixel_index_orig; + + if (tiledata_size > 0) { - copy_partial = true; - if (index_y != end_index_y) + bool copy_partial = false; + uint32_t fixed_nbytes_tile_pixel_size_x = nbytes_tile_pixel_size_x; + uint32_t fixed_tile_pixel_offset_ey = tile_pixel_offset_ey; + + if (offset_x == offset_boundary_x) { - fixed_tile_pixel_offset_ey = pixel_offset_boundary_y; + copy_partial = true; + if (offset_x != offset_ex) + { + fixed_nbytes_tile_pixel_size_x = + (pixel_offset_boundary_x - tile_pixel_offset_x + 1) * samples_per_pixel; + } + else + { + fixed_nbytes_tile_pixel_size_x = + (std::min(pixel_offset_boundary_x, pixel_offset_ex) - tile_pixel_offset_x + 1) * + samples_per_pixel; + } } - else + if (index_y == boundary_index_y) { - fixed_tile_pixel_offset_ey = std::min(pixel_offset_boundary_y, pixel_offset_ey); + copy_partial = true; + if (index_y != end_index_y) + { + fixed_tile_pixel_offset_ey = pixel_offset_boundary_y; + } + else + { + fixed_tile_pixel_offset_ey = std::min(pixel_offset_boundary_y, pixel_offset_ey); + } } - } - uint8_t* tile_data = tile_raster; + uint8_t* tile_data = nullptr; + std::unique_ptr tile_raster = + std::unique_ptr(nullptr, cucim_free); - auto key = image_cache.create_key(ifd_hash_value, index); - image_cache.lock(index_hash); - auto value = image_cache.find(key); - if (value) - { - image_cache.unlock(index_hash); - tile_data = static_cast(value->data); - } - else - { - // Lifetime of tile_data is same with `value` - // : do not access this data when `value` is not accessible. 
- if (cache_type != cucim::cache::CacheType::kNoCache) + if (loader && loader->batch_data_processor()) { - tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); - } - - { - PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); switch (compression_method) { - case COMPRESSION_NONE: - cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - break; case COMPRESSION_JPEG: - cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - jpegtable_data, jpegtable_count, &tile_data, out_device, - jpeg_color_space); - break; - case COMPRESSION_ADOBE_DEFLATE: - case COMPRESSION_DEFLATE: - cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device); - break; - case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kSYCC); - break; - case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 - cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, - &tile_data, tile_raster_nbytes, out_device, - cuslide::jpeg2k::ColorSpace::kRGB); - break; - case COMPRESSION_LZW: - cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, &tile_data, - tile_raster_nbytes, out_device); - // Apply unpredictor - // 1: none, 2: horizontal differencing, 3: floating point predictor - // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf - if (predictor == 2) - { - cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); - } break; default: throw std::runtime_error("Unsupported compression method"); } - } - value = image_cache.create_value(tile_data, tile_raster_nbytes); - image_cache.insert(key, value); - image_cache.unlock(index_hash); - } - if (copy_partial) - { - uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; - // Fill original, then fill white for remaining - if (fill_gap_x > 0) - { - for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + auto value = loader->wait_for_processing(index); + if (!value) // if shutdown { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, - fixed_nbytes_tile_pixel_size_x); - memset(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, background_value, - fill_gap_x); + return; + } + + tile_data = static_cast(value->data); + + cudaError_t cuda_status; + if (copy_partial) + { + uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; + // Fill original, then fill white for remaining + if (fill_gap_x > 0) + { + CUDA_ERROR(cudaMemcpy2D( + dest_start_ptr + dest_pixel_index, dest_pixel_step_y, tile_data + nbytes_tile_index, + nbytes_tw, fixed_nbytes_tile_pixel_size_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1, cudaMemcpyDeviceToDevice)); + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, + dest_pixel_step_y, background_value, fill_gap_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1)); + dest_pixel_index += + dest_pixel_step_y * (fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1); + } + else + { + CUDA_ERROR(cudaMemcpy2D( + dest_start_ptr + dest_pixel_index, dest_pixel_step_y, tile_data + nbytes_tile_index, + nbytes_tw, 
fixed_nbytes_tile_pixel_size_x, + fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1, cudaMemcpyDeviceToDevice)); + dest_pixel_index += + dest_pixel_step_y * (fixed_tile_pixel_offset_ey - tile_pixel_offset_sy + 1); + } + + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + background_value, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - (fixed_tile_pixel_offset_ey + 1) + 1)); + } + else + { + CUDA_ERROR(cudaMemcpy2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, + tile_data + nbytes_tile_index, nbytes_tw, nbytes_tile_pixel_size_x, + tile_pixel_offset_ey - tile_pixel_offset_sy + 1, + cudaMemcpyDeviceToDevice)); } } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + auto key = image_cache.create_key(ifd_hash_value, index); + image_cache.lock(index_hash); + auto value = image_cache.find(key); + if (value) { - memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, - fixed_nbytes_tile_pixel_size_x); + image_cache.unlock(index_hash); + tile_data = static_cast(value->data); } - } + else + { + // Lifetime of tile_data is same with `value` + // : do not access this data when `value` is not accessible. + if (cache_type != cucim::cache::CacheType::kNoCache) + { + tile_data = static_cast(image_cache.allocate(tile_raster_nbytes)); + } + else + { + // Allocate temporary buffer for tile data + tile_raster = std::unique_ptr( + reinterpret_cast(cucim_malloc(tile_raster_nbytes)), cucim_free); + tile_data = tile_raster.get(); + } + { + PROF_SCOPED_RANGE(PROF_EVENT(ifd_decompression)); + switch (compression_method) + { + case COMPRESSION_NONE: + cuslide::raw::decode_raw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case COMPRESSION_JPEG: + cuslide::jpeg::decode_libjpeg(tiff_file, nullptr, tiledata_offset, tiledata_size, + jpegtable_data, jpegtable_count, &tile_data, + out_device, jpeg_color_space); + break; + case COMPRESSION_ADOBE_DEFLATE: + case COMPRESSION_DEFLATE: + cuslide::deflate::decode_deflate(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + break; + case cuslide::jpeg2k::kAperioJpeg2kYCbCr: // 33003 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kSYCC); + break; + case cuslide::jpeg2k::kAperioJpeg2kRGB: // 33005 + cuslide::jpeg2k::decode_libopenjpeg(tiff_file, nullptr, tiledata_offset, + tiledata_size, &tile_data, tile_raster_nbytes, + out_device, cuslide::jpeg2k::ColorSpace::kRGB); + break; + case COMPRESSION_LZW: + cuslide::lzw::decode_lzw(tiff_file, nullptr, tiledata_offset, tiledata_size, + &tile_data, tile_raster_nbytes, out_device); + // Apply unpredictor + // 1: none, 2: horizontal differencing, 3: floating point predictor + // https://www.adobe.io/content/dam/udp/en/open/standards/tiff/TIFF6.pdf + if (predictor == 2) + { + cuslide::lzw::horAcc8(tile_data, tile_raster_nbytes, nbytes_tw); + } + break; + default: + throw std::runtime_error("Unsupported compression method"); + } + } + value = image_cache.create_value(tile_data, tile_raster_nbytes); + image_cache.insert(key, value); + image_cache.unlock(index_hash); + } + if (copy_partial) + { + uint32_t fill_gap_x = nbytes_tile_pixel_size_x - fixed_nbytes_tile_pixel_size_x; + // Fill original, then fill white for remaining + if (fill_gap_x > 
0) + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + fixed_nbytes_tile_pixel_size_x); + memset(dest_start_ptr + dest_pixel_index + fixed_nbytes_tile_pixel_size_x, + background_value, fill_gap_x); + } + } + else + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= fixed_tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + fixed_nbytes_tile_pixel_size_x); + } + } - for (uint32_t ty = fixed_tile_pixel_offset_ey + 1; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y) - { - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + for (uint32_t ty = fixed_tile_pixel_offset_ey + 1; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y) + { + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + memcpy(dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, + nbytes_tile_pixel_size_x); + } + } } } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + + if (out_device.type() == cucim::io::DeviceType::kCPU) { - memcpy( - dest_start_ptr + dest_pixel_index, tile_data + nbytes_tile_index, nbytes_tile_pixel_size_x); + for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; + ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) + { + // Set (255,255,255) + memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); + } + } + else + { + cudaError_t cuda_status; + CUDA_ERROR(cudaMemset2D(dest_start_ptr + dest_pixel_index, dest_pixel_step_y, background_value, + nbytes_tile_pixel_size_x, tile_pixel_offset_ey - tile_pixel_offset_sy)); } } + }; + + if (loader && *loader) + { + loader->enqueue(std::move(decode_func), + cucim::loader::TileInfo{ location_index, index, tiledata_offset, tiledata_size }); } else { - for (uint32_t ty = tile_pixel_offset_sy; ty <= tile_pixel_offset_ey; - ++ty, dest_pixel_index += dest_pixel_step_y, nbytes_tile_index += nbytes_tw) - { - // Set (255,255,255) - memset(dest_start_ptr + dest_pixel_index, background_value, nbytes_tile_pixel_size_x); - } + decode_func(); } + dest_pixel_index_x += nbytes_tile_pixel_size_x; } dest_start_ptr += dest_pixel_step_y * dest_pixel_offset_len_y; } - if (tile_raster) - { - cucim_free(tile_raster); - } return true; } @@ -966,4 +1239,4 @@ void IFD::write_offsets_(const char* file_path) offsets.close(); } -} // namespace cuslide::tiff \ No newline at end of file +} // namespace cuslide::tiff diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h index 16601f0d7..e15724782 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h @@ -19,13 +19,15 @@ #include "types.h" +#include +#include + +#include #include #include +#include //#include -#include -#include - namespace cuslide::tiff { @@ -40,21 +42,23 @@ class EXPORT_VISIBLE IFD : public 
std::enable_shared_from_this static bool read_region_tiles(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device); + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader); static bool read_region_tiles_boundary(const TIFF* tiff, const IFD* ifd, - const int64_t sx, - const int64_t sy, + const int64_t* location, + const int64_t location_index, const int64_t w, const int64_t h, void* raster, - const cucim::io::Device& out_device); + const cucim::io::Device& out_device, + cucim::loader::ThreadBatchDataLoader* loader); bool read(const TIFF* tiff, const cucim::io::format::ImageMetadataDesc* metadata, @@ -88,6 +92,9 @@ class EXPORT_VISIBLE IFD : public std::enable_shared_from_this const std::vector& image_piece_offsets() const; const std::vector& image_piece_bytecounts() const; + size_t pixel_size_nbytes() const; + size_t tile_raster_size_nbytes() const; + // Hidden methods for benchmarking void write_offsets_(const char* file_path); diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp index a99d1a411..cc3c5fd31 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp @@ -281,8 +281,8 @@ TIFF::TIFF(const cucim::filesystem::Path& file_path, int mode) : file_path_(file cucim_free(file_path_cstr); throw std::invalid_argument(fmt::format("Cannot load {}!", file_path)); } - // TODO: make file_handle_ object to pointer - file_handle_ = CuCIMFileHandle{ fd, nullptr, FileHandleType::kPosix, file_path_cstr, this }; + file_handle_shared_ = std::make_shared(fd, nullptr, FileHandleType::kPosix, file_path_cstr, this); + file_handle_ = file_handle_shared_.get(); // TODO: warning if the file is big endian is_big_endian_ = ::TIFFIsBigEndian(tiff_client_); @@ -318,17 +318,6 @@ void TIFF::close() TIFFClose(tiff_client_); tiff_client_ = nullptr; } - if (file_handle_.path) - { - cucim_free(file_handle_.path); - file_handle_.path = nullptr; - } - if (file_handle_.client_data) - { - // Deleting file_handle_.client_data is parser_close()'s responsibility - // Do not execute this: `delete static_cast(file_handle_.client_data);` - file_handle_.client_data = nullptr; - } if (metadata_) { delete reinterpret_cast(metadata_); @@ -698,8 +687,8 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, return read_associated_image(metadata, request, out_image_data, out_metadata); } - // TODO: assume length of location/size to 2. - constexpr int32_t ndims = 2; + const int32_t ndim = request->size_ndim; + const uint64_t location_len = request->location_len; if (request->level >= level_to_ifd_idx_.size()) { @@ -711,7 +700,7 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, auto original_img_width = main_ifd->width(); auto original_img_height = main_ifd->height(); - for (int32_t i = 0; i < ndims; ++i) + for (int32_t i = 0; i < ndim; ++i) { if (request->size[i] <= 0) { @@ -733,7 +722,7 @@ bool TIFF::read(const cucim::io::format::ImageMetadataDesc* metadata, float downsample_factor = metadata->resolution_info.level_downsamples[request->level]; // Change request based on downsample factor. 
(normalized value at level-0 -> real location at the requested level) - for (int32_t i = 0; i < ndims; ++i) + for (int64_t i = ndim * location_len - 1; i >= 0; --i) { request->location[i] /= downsample_factor; } @@ -831,8 +820,9 @@ bool TIFF::read_associated_image(const cucim::io::format::ImageMetadataDesc* met switch (compression_method) { case COMPRESSION_JPEG: - if (!cuslide::jpeg::decode_libjpeg(file_handle_.fd, nullptr /*jpeg_buf*/, offset, size, jpegtable_data, - jpegtable_count, &target_ptr, out_device, jpeg_color_space)) + if (!cuslide::jpeg::decode_libjpeg(file_handle_->fd, nullptr /*jpeg_buf*/, offset, size, + jpegtable_data, jpegtable_count, &target_ptr, out_device, + jpeg_color_space)) { cucim_free(raster); fmt::print(stderr, "[Error] Failed to read region with libjpeg!\n"); @@ -840,8 +830,8 @@ bool TIFF::read_associated_image(const cucim::io::format::ImageMetadataDesc* met } break; case COMPRESSION_LZW: - if (!cuslide::lzw::decode_lzw( - file_handle_.fd, nullptr /*jpeg_buf*/, offset, size, &target_ptr, strip_nbytes, out_device)) + if (!cuslide::lzw::decode_lzw(file_handle_->fd, nullptr /*jpeg_buf*/, offset, size, &target_ptr, + strip_nbytes, out_device)) { cucim_free(raster); fmt::print(stderr, "[Error] Failed to read region with lzw decoder!\n"); @@ -1041,9 +1031,9 @@ cucim::filesystem::Path TIFF::file_path() const return file_path_; } -CuCIMFileHandle TIFF::file_handle() const +std::shared_ptr& TIFF::file_handle() { - return file_handle_; + return file_handle_shared_; } ::TIFF* TIFF::client() const { diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h index 6b3730e88..c770669aa 100644 --- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h +++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.h @@ -68,7 +68,8 @@ class EXPORT_VISIBLE TIFF : public std::enable_shared_from_this cucim::io::format::ImageMetadataDesc* out_metadata); cucim::filesystem::Path file_path() const; - CuCIMFileHandle file_handle() const; + std::shared_ptr& file_handle(); /// used for moving the ownership of the file handle to the caller. + /// Do not use for the application -- it will return nullptr. 
::TIFF* client() const; const std::vector& ifd_offsets() const; std::shared_ptr ifd(size_t index) const; @@ -104,7 +105,9 @@ class EXPORT_VISIBLE TIFF : public std::enable_shared_from_this void _populate_aperio_svs_metadata(uint16_t ifd_count, void* metadata, std::shared_ptr& first_ifd); cucim::filesystem::Path file_path_; - CuCIMFileHandle file_handle_{}; + /// Temporary shared file handle whose ownership would be transferred to CuImage through parser_open() + std::shared_ptr file_handle_shared_; + CuCIMFileHandle* file_handle_ = nullptr; ::TIFF* tiff_client_ = nullptr; std::vector ifd_offsets_; /// IFD offset for an index (IFD index) std::vector> ifds_; /// IFD object for an index (IFD index) diff --git a/cpp/src/cache/image_cache.cpp b/cpp/src/cache/image_cache.cpp index 3cf4099ce..5036911b7 100644 --- a/cpp/src/cache/image_cache.cpp +++ b/cpp/src/cache/image_cache.cpp @@ -26,7 +26,8 @@ ImageCacheKey::ImageCacheKey(uint64_t file_hash, uint64_t index) : file_hash(fil { } -ImageCacheValue::ImageCacheValue(void* data, uint64_t size, void* user_obj) : data(data), size(size), user_obj(user_obj) +ImageCacheValue::ImageCacheValue(void* data, uint64_t size, void* user_obj, const cucim::io::DeviceType device_type) + : data(data), size(size), user_obj(user_obj), device_type(device_type) { } @@ -36,7 +37,8 @@ ImageCacheValue::operator bool() const } -ImageCache::ImageCache(const ImageCacheConfig& config, CacheType type) : type_(type), config_(config){}; +ImageCache::ImageCache(const ImageCacheConfig& config, CacheType type, const cucim::io::DeviceType device_type) + : type_(type), device_type_(device_type), config_(config){}; CacheType ImageCache::type() const { @@ -48,6 +50,11 @@ const char* ImageCache::type_str() const return "nocache"; } +cucim::io::DeviceType ImageCache::device_type() const +{ + return device_type_; +} + ImageCacheConfig& ImageCache::config() { return config_; diff --git a/cpp/src/cache/image_cache_empty.cpp b/cpp/src/cache/image_cache_empty.cpp index 0cd0a0c5e..c5935eb62 100644 --- a/cpp/src/cache/image_cache_empty.cpp +++ b/cpp/src/cache/image_cache_empty.cpp @@ -27,7 +27,7 @@ std::shared_ptr EmptyImageCache::create_key(uint64_t, uint64_t) { return std::make_shared(0, 0); } -std::shared_ptr EmptyImageCache::create_value(void*, uint64_t) +std::shared_ptr EmptyImageCache::create_value(void*, uint64_t, const cucim::io::DeviceType) { return std::make_shared(nullptr, 0); } @@ -47,11 +47,19 @@ void EmptyImageCache::unlock(uint64_t) return; } +void* EmptyImageCache::mutex(uint64_t) +{ + return nullptr; +} + bool EmptyImageCache::insert(std::shared_ptr&, std::shared_ptr&) { return true; } +void EmptyImageCache::remove_front() +{ +} uint32_t EmptyImageCache::size() const { diff --git a/cpp/src/cache/image_cache_empty.h b/cpp/src/cache/image_cache_empty.h index b592c817a..5ad367d9b 100644 --- a/cpp/src/cache/image_cache_empty.h +++ b/cpp/src/cache/image_cache_empty.h @@ -35,13 +35,16 @@ class EmptyImageCache : public ImageCache EmptyImageCache(const ImageCacheConfig& config); std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void 
remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; diff --git a/cpp/src/cache/image_cache_manager.cpp b/cpp/src/cache/image_cache_manager.cpp index bd8a8a7e9..ce67b5f5d 100644 --- a/cpp/src/cache/image_cache_manager.cpp +++ b/cpp/src/cache/image_cache_manager.cpp @@ -29,7 +29,7 @@ namespace cucim::cache { -uint32_t preferred_memory_capacity(const std::vector& image_size, +uint32_t preferred_memory_capacity(const std::vector& image_size, const std::vector& tile_size, const std::vector& patch_size, uint32_t bytes_per_pixel) @@ -47,7 +47,9 @@ uint32_t preferred_memory_capacity(const std::vector& image_size, // The maximal number of tiles (y-axis) overapped with the given patch uint32_t patch_down_count = - std::min(image_size[1] + (tile_size[1] - 1), patch_size[1] + (tile_size[1] - 1)) / tile_size[1] + 1; + std::min(image_size[1] + (tile_size[1] - 1), static_cast(patch_size[1] + (tile_size[1] - 1))) / + tile_size[1] + + 1; // (tile_accross_count) x (tile width) x (tile_height) x (patch_down_count) x (bytes per pixel) uint64_t bytes_needed = @@ -95,14 +97,8 @@ void ImageCacheManager::reserve(uint32_t new_memory_capacity, uint32_t new_capac cache_->reserve(cache_config); } -std::unique_ptr ImageCacheManager::create_cache() const -{ - ImageCacheConfig& cache_config = cucim::CuImage::get_config()->cache(); - - return create_cache(cache_config); -} - -std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConfig& cache_config) const +std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConfig& cache_config, + const cucim::io::DeviceType device_type) { PROF_SCOPED_RANGE(PROF_EVENT(image_cache_create_cache)); switch (cache_config.type) @@ -110,12 +106,19 @@ std::unique_ptr ImageCacheManager::create_cache(const ImageCacheConf case CacheType::kNoCache: return std::make_unique(cache_config); case CacheType::kPerProcess: - return std::make_unique(cache_config); + return std::make_unique(cache_config, device_type); case CacheType::kSharedMemory: - return std::make_unique(cache_config); + return std::make_unique(cache_config, device_type); default: return std::make_unique(cache_config); } } +std::unique_ptr ImageCacheManager::create_cache() const +{ + ImageCacheConfig& cache_config = cucim::CuImage::get_config()->cache(); + + return create_cache(cache_config); +} + } // namespace cucim::cache diff --git a/cpp/src/cache/image_cache_per_process.cpp b/cpp/src/cache/image_cache_per_process.cpp index f2b36889f..95d2286c5 100644 --- a/cpp/src/cache/image_cache_per_process.cpp +++ b/cpp/src/cache/image_cache_per_process.cpp @@ -17,7 +17,9 @@ #include "image_cache_per_process.h" +#include "cucim/cache/image_cache.h" #include "cucim/memory/memory_manager.h" +#include "cucim/util/cuda.h" #include @@ -53,19 +55,38 @@ struct PerProcessImageCacheItem std::shared_ptr value; }; -PerProcessImageCacheValue::PerProcessImageCacheValue(void* data, uint64_t size, void* user_obj) - : ImageCacheValue(data, size, user_obj){}; +PerProcessImageCacheValue::PerProcessImageCacheValue(void* data, + uint64_t size, + void* user_obj, + const cucim::io::DeviceType device_type) + : ImageCacheValue(data, size, user_obj, device_type){}; + PerProcessImageCacheValue::~PerProcessImageCacheValue() { if (data) { - cucim_free(data); + switch (device_type) + { + case io::DeviceType::kCPU: + cucim_free(data); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + CUDA_TRY(cudaFree(data)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + 
case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } data = nullptr; } }; -PerProcessImageCache::PerProcessImageCache(const ImageCacheConfig& config) - : ImageCache(config, CacheType::kPerProcess), +PerProcessImageCache::PerProcessImageCache(const ImageCacheConfig& config, const cucim::io::DeviceType device_type) + : ImageCache(config, CacheType::kPerProcess, device_type), mutex_array_(config.mutex_pool_capacity), capacity_nbytes_(kOneMiB * config.memory_capacity), capacity_(config.capacity), @@ -89,14 +110,32 @@ std::shared_ptr PerProcessImageCache::create_key(uint64_t file_ha { return std::make_shared(file_hash, index); } -std::shared_ptr PerProcessImageCache::create_value(void* data, uint64_t size) +std::shared_ptr PerProcessImageCache::create_value(void* data, + uint64_t size, + const cucim::io::DeviceType device_type) { - return std::make_shared(data, size); + return std::make_shared(data, size, nullptr, device_type); } void* PerProcessImageCache::allocate(std::size_t n) { - return cucim_malloc(n); + switch (device_type_) + { + case io::DeviceType::kCPU: + return cucim_malloc(n); + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + void* image_data_ptr = nullptr; + CUDA_TRY(cudaMalloc(&image_data_ptr, n)); + return image_data_ptr; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type_); + break; + } + return nullptr; } void PerProcessImageCache::lock(uint64_t index) @@ -109,6 +148,11 @@ void PerProcessImageCache::unlock(uint64_t index) mutex_array_[index % mutex_pool_capacity_].unlock(); } +void* PerProcessImageCache::mutex(uint64_t index) +{ + return &mutex_array_[index % mutex_pool_capacity_]; +} + bool PerProcessImageCache::insert(std::shared_ptr& key, std::shared_ptr& value) { if (value->size > capacity_nbytes_ || capacity_ < 1) @@ -135,6 +179,32 @@ bool PerProcessImageCache::insert(std::shared_ptr& key, std::shar return succeed; } +void PerProcessImageCache::remove_front() +{ + while (true) + { + uint32_t head = list_head_.load(std::memory_order_relaxed); + uint32_t tail = list_tail_.load(std::memory_order_relaxed); + if (head != tail) + { + // Remove front by increasing head + if (list_head_.compare_exchange_weak( + head, (head + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) + { + std::shared_ptr head_item = list_[head]; + size_nbytes_.fetch_sub(head_item->value->size, std::memory_order_relaxed); + hashmap_.erase(head_item->key); + list_[head].reset(); // decrease refcount + break; + } + } + else + { + break; // already empty + } + } +} + uint32_t PerProcessImageCache::size() const { uint32_t head = list_head_.load(std::memory_order_relaxed); @@ -279,36 +349,6 @@ bool PerProcessImageCache::is_memory_full(uint64_t additional_size) const } } -void PerProcessImageCache::remove_front() -{ - while (true) - { - uint32_t head = list_head_.load(std::memory_order_relaxed); - uint32_t tail = list_tail_.load(std::memory_order_relaxed); - if (head != tail) - { - // Remove front by increasing head - if (list_head_.compare_exchange_weak( - head, (head + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) - { - // fmt::print(stderr, "{} remove list_[{:05}]\n", std::hash{}(std::this_thread::get_id()), head); //[print_list] - std::shared_ptr head_item = list_[head]; - // if (head_item) // it is possible that head_item is nullptr. 
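The remove_front() added above evicts the oldest cache entry by advancing the head index of a fixed-capacity ring with compare_exchange_weak instead of holding a lock; only the thread whose CAS succeeds releases the slot and subtracts its byte count. A minimal standalone sketch of the same pattern (Ring and Slot are illustrative names, not cuCIM types):

```cpp
#include <atomic>
#include <cstdint>
#include <vector>

struct Slot { uint64_t size = 0; };

class Ring {
public:
    explicit Ring(uint32_t capacity) : slots_(capacity), capacity_(capacity) {}

    // Claim the front slot by advancing head; only the thread whose CAS
    // succeeds releases the item, so no mutex is required.
    bool remove_front() {
        while (true) {
            uint32_t head = head_.load(std::memory_order_relaxed);
            uint32_t tail = tail_.load(std::memory_order_relaxed);
            if (head == tail) {
                return false; // already empty
            }
            if (head_.compare_exchange_weak(head, (head + 1) % capacity_,
                                            std::memory_order_release,
                                            std::memory_order_relaxed)) {
                bytes_.fetch_sub(slots_[head].size, std::memory_order_relaxed);
                slots_[head] = Slot{}; // drop the reference
                return true;
            }
            // CAS failed: another thread won the race; retry with a fresh head.
        }
    }

private:
    std::vector<Slot> slots_;
    uint32_t capacity_;
    std::atomic<uint32_t> head_{0};
    std::atomic<uint32_t> tail_{0};
    std::atomic<uint64_t> bytes_{0};
};
```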
- // { - size_nbytes_.fetch_sub(head_item->value->size, std::memory_order_relaxed); - hashmap_.erase(head_item->key); - list_[head].reset(); // decrease refcount - break; - // } - } - } - else - { - break; // already empty - } - } -} - void PerProcessImageCache::push_back(std::shared_ptr& item) { uint32_t tail = list_tail_.load(std::memory_order_relaxed); @@ -318,7 +358,6 @@ void PerProcessImageCache::push_back(std::shared_ptr& if (list_tail_.compare_exchange_weak( tail, (tail + 1) % list_capacity_, std::memory_order_release, std::memory_order_relaxed)) { - // fmt::print(stderr, "{} list_[{:05}]={}\n", std::hash{}(std::this_thread::get_id()), tail, (uint64_t)item->key->location_hash); // [print_list] list_[tail] = item; size_nbytes_.fetch_add(item->value->size, std::memory_order_relaxed); break; diff --git a/cpp/src/cache/image_cache_per_process.h b/cpp/src/cache/image_cache_per_process.h index 67c325fd0..5279e44e6 100644 --- a/cpp/src/cache/image_cache_per_process.h +++ b/cpp/src/cache/image_cache_per_process.h @@ -49,7 +49,10 @@ struct PerProcessImageCacheItem; struct PerProcessImageCacheValue : public ImageCacheValue { - PerProcessImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + PerProcessImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~PerProcessImageCacheValue() override; }; @@ -64,19 +67,23 @@ struct PerProcessImageCacheValue : public ImageCacheValue class PerProcessImageCache : public ImageCache { public: - PerProcessImageCache(const ImageCacheConfig& config); + PerProcessImageCache(const ImageCacheConfig& config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~PerProcessImageCache(); const char* type_str() const override; std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; @@ -98,7 +105,6 @@ class PerProcessImageCache : public ImageCache private: bool is_list_full() const; bool is_memory_full(uint64_t additional_size = 0) const; - void remove_front(); void push_back(std::shared_ptr& item); bool erase(const std::shared_ptr& key); diff --git a/cpp/src/cache/image_cache_shared_memory.cpp b/cpp/src/cache/image_cache_shared_memory.cpp index d09e8d47c..8b62cac68 100644 --- a/cpp/src/cache/image_cache_shared_memory.cpp +++ b/cpp/src/cache/image_cache_shared_memory.cpp @@ -154,8 +154,11 @@ struct ImageCacheItemDetail deleter_type value; }; -SharedMemoryImageCacheValue::SharedMemoryImageCacheValue(void* data, uint64_t size, void* user_obj) - : ImageCacheValue(data, size, user_obj){}; +SharedMemoryImageCacheValue::SharedMemoryImageCacheValue(void* data, + uint64_t size, + void* user_obj, + const cucim::io::DeviceType device_type) + : ImageCacheValue(data, size, user_obj, device_type){}; SharedMemoryImageCacheValue::~SharedMemoryImageCacheValue() { @@ -169,8 +172,8 @@ SharedMemoryImageCacheValue::~SharedMemoryImageCacheValue() } }; -SharedMemoryImageCache::SharedMemoryImageCache(const 
ImageCacheConfig& config) - : ImageCache(config, CacheType::kSharedMemory), +SharedMemoryImageCache::SharedMemoryImageCache(const ImageCacheConfig& config, const cucim::io::DeviceType device_type) + : ImageCache(config, CacheType::kSharedMemory, device_type), segment_(create_segment(config)), // mutex_array_(nullptr, shared_mem_deleter(segment_)), size_nbytes_(nullptr, shared_mem_deleter>(segment_)), @@ -190,6 +193,12 @@ SharedMemoryImageCache::SharedMemoryImageCache(const ImageCacheConfig& config) const uint32_t& mutex_pool_capacity = config.mutex_pool_capacity; const bool& record_stat = config.record_stat; + if (device_type != cucim::io::DeviceType::kCPU) + { + throw std::runtime_error( + fmt::format("[Error] SharedMemoryImageCache doesn't support other memory type other than CPU memory!\n")); + } + try { // mutex_array_.reset(segment_->find_or_construct_it( @@ -282,11 +291,13 @@ std::shared_ptr SharedMemoryImageCache::create_key(uint64_t file_ return std::shared_ptr(key.get().get(), null_deleter(key)); } -std::shared_ptr SharedMemoryImageCache::create_value(void* data, uint64_t size) +std::shared_ptr SharedMemoryImageCache::create_value(void* data, + uint64_t size, + const cucim::io::DeviceType device_type) { auto value = boost::interprocess::make_managed_shared_ptr( segment_->find_or_construct(boost::interprocess::anonymous_instance)( - data, size, &*segment_), + data, size, &*segment_, device_type), *segment_); return std::shared_ptr(value.get().get(), null_deleter(value)); @@ -333,6 +344,11 @@ void SharedMemoryImageCache::unlock(uint64_t index) mutex_array_[index % *mutex_pool_capacity_].unlock(); } +void* SharedMemoryImageCache::mutex(uint64_t index) +{ + return &mutex_array_[index % *mutex_pool_capacity_]; +} + bool SharedMemoryImageCache::insert(std::shared_ptr& key, std::shared_ptr& value) { if (value->size > *capacity_nbytes_ || *capacity_ < 1) @@ -359,6 +375,36 @@ bool SharedMemoryImageCache::insert(std::shared_ptr& key, std::sh return succeed; } +void SharedMemoryImageCache::remove_front() +{ + while (true) + { + uint32_t head = (*list_head_).load(std::memory_order_relaxed); + uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); + if (head != tail) + { + // Remove front by increasing head + if ((*list_head_) + .compare_exchange_weak( + head, (head + 1) % (*list_capacity_), std::memory_order_release, std::memory_order_relaxed)) + { + auto& head_item = (*list_)[head]; + if (head_item) // it is possible that head_item is nullptr + { + (*size_nbytes_).fetch_sub(head_item->value->size, std::memory_order_relaxed); + hashmap_->erase(head_item->key); + (*list_)[head].reset(); // decrease refcount + break; + } + } + } + else + { + break; // already empty + } + } +} + uint32_t SharedMemoryImageCache::size() const { uint32_t head = list_head_->load(std::memory_order_relaxed); @@ -508,36 +554,6 @@ bool SharedMemoryImageCache::is_memory_full(uint64_t additional_size) const } } -void SharedMemoryImageCache::remove_front() -{ - while (true) - { - uint32_t head = (*list_head_).load(std::memory_order_relaxed); - uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); - if (head != tail) - { - // Remove front by increasing head - if ((*list_head_) - .compare_exchange_weak( - head, (head + 1) % (*list_capacity_), std::memory_order_release, std::memory_order_relaxed)) - { - auto& head_item = (*list_)[head]; - if (head_item) // it is possible that head_item is nullptr - { - (*size_nbytes_).fetch_sub(head_item->value->size, std::memory_order_relaxed); - 
hashmap_->erase(head_item->key); - (*list_)[head].reset(); // decrease refcount - break; - } - } - } - else - { - break; // already empty - } - } -} - void SharedMemoryImageCache::push_back(cache_item_type& item) { uint32_t tail = (*list_tail_).load(std::memory_order_relaxed); diff --git a/cpp/src/cache/image_cache_shared_memory.h b/cpp/src/cache/image_cache_shared_memory.h index dcf5b560c..3f8134977 100644 --- a/cpp/src/cache/image_cache_shared_memory.h +++ b/cpp/src/cache/image_cache_shared_memory.h @@ -40,7 +40,10 @@ struct ImageCacheItemDetail; struct SharedMemoryImageCacheValue : public ImageCacheValue { - SharedMemoryImageCacheValue(void* data, uint64_t size, void* user_obj = nullptr); + SharedMemoryImageCacheValue(void* data, + uint64_t size, + void* user_obj = nullptr, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~SharedMemoryImageCacheValue() override; }; @@ -121,19 +124,23 @@ using cache_item_type = boost::interprocess::shared_ptr< class SharedMemoryImageCache : public ImageCache { public: - SharedMemoryImageCache(const ImageCacheConfig& config); + SharedMemoryImageCache(const ImageCacheConfig& config, + const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU); ~SharedMemoryImageCache(); const char* type_str() const override; std::shared_ptr create_key(uint64_t file_hash, uint64_t index) override; - std::shared_ptr create_value(void* data, uint64_t size) override; + std::shared_ptr create_value( + void* data, uint64_t size, const cucim::io::DeviceType device_type = cucim::io::DeviceType::kCPU) override; void* allocate(std::size_t n) override; void lock(uint64_t index) override; void unlock(uint64_t index) override; + void* mutex(uint64_t index) override; bool insert(std::shared_ptr& key, std::shared_ptr& value) override; + void remove_front() override; uint32_t size() const override; uint64_t memory_size() const override; @@ -155,7 +162,6 @@ class SharedMemoryImageCache : public ImageCache private: bool is_list_full() const; bool is_memory_full(uint64_t additional_size = 0) const; - void remove_front(); void push_back(cache_item_type& item); bool erase(const std::shared_ptr& key); diff --git a/cpp/src/concurrent/threadpool.cpp b/cpp/src/concurrent/threadpool.cpp new file mode 100644 index 000000000..595b40aff --- /dev/null +++ b/cpp/src/concurrent/threadpool.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cucim/concurrent/threadpool.h" + +#include <future> +#include <taskflow/taskflow.hpp> + +#include "cucim/profiler/nvtx3.h" + +namespace cucim::concurrent +{ + +struct ThreadPool::Executor : public tf::Executor +{ + // Inherits constructors + using tf::Executor::Executor; +}; + + +ThreadPool::ThreadPool(int32_t num_workers) +{ + num_workers_ = num_workers; + if (num_workers > 0) + { + executor_ = std::make_unique<Executor>(num_workers); + } +} + +ThreadPool::~ThreadPool() +{ + if (executor_) + { + executor_->wait_for_all(); + } +} + +ThreadPool::operator bool() const +{ + return (num_workers_ > 0); +} + +std::future<void> ThreadPool::enqueue(std::function<void()> task) +{ + auto future = executor_->async([task]() { task(); }); + return std::move(future); +} + +void ThreadPool::wait() +{ + if (executor_) + { + executor_->wait_for_all(); + } +} + +} // namespace cucim::concurrent diff --git a/cpp/src/cuimage.cpp b/cpp/src/cuimage.cpp index b41b5be64..e4b03d0fb 100644 --- a/cpp/src/cuimage.cpp +++ b/cpp/src/cuimage.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #if CUCIM_SUPPORT_CUDA # include #endif @@ -143,21 +144,25 @@ std::unique_ptr CuImage::image_format_plugins_ = std::make_ CuImage::CuImage(const filesystem::Path& path) { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 1)); - // printf("[cuCIM] CuImage::CuImage(filesystem::Path path)\n"); + ensure_init(); image_format_ = image_format_plugins_->detect_image_format(path); // TODO: need to detect available format for the file path { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_cuimage_open)); - file_handle_ = image_format_->image_parser.open(path.c_str()); + std::shared_ptr<CuCIMFileHandle>* file_handle_shared = + reinterpret_cast<std::shared_ptr<CuCIMFileHandle>*>(image_format_->image_parser.open(path.c_str())); + file_handle_ = *file_handle_shared; + delete file_handle_shared; + + // Set deleter to close the file handle + file_handle_->set_deleter(image_format_->image_parser.close); } - // printf("[GB] file_handle: %s\n", file_handle_.path); - // fmt::print("[GB] CuImage path char: '{}'\n", file_handle_.path[0]); io::format::ImageMetadata& image_metadata = *(new io::format::ImageMetadata{}); image_metadata_ = &image_metadata.desc(); - is_loaded_ = image_format_->image_parser.parse(&file_handle_, image_metadata_); + is_loaded_ = image_format_->image_parser.parse(file_handle_.get(), image_metadata_); dim_indices_ = DimIndices(image_metadata_->dims); auto& associated_image_info = image_metadata_->associated_image_info; @@ -178,18 +183,10 @@ CuImage::CuImage(const filesystem::Path& path, const std::string& plugin_name) (void)plugin_name; } -// CuImage::CuImage(const CuImage& cuimg) : std::enable_shared_from_this<CuImage>() //{ // printf("[cuCIM] CuImage::CuImage(const CuImage& cuimg)\n"); // (void)cuimg; // //} - CuImage::CuImage(CuImage&& cuimg) : std::enable_shared_from_this<CuImage>() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 3)); - // printf("[cuCIM] CuImage::CuImage(CuImage&& cuimg) %s\n", cuimg.file_handle_.path); - (void)cuimg; + std::swap(file_handle_, cuimg.file_handle_); std::swap(image_format_, cuimg.image_format_); std::swap(image_metadata_, cuimg.image_metadata_); @@ -205,11 +202,8 @@ CuImage::CuImage(const CuImage* cuimg, : std::enable_shared_from_this<CuImage>() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 4)); - // printf( - // "[cuCIM] CuImage::CuImage(CuImage* cuimg, io::format::ImageMetadataDesc* image_metadata, - // cucim::io::format::ImageDataDesc* image_data)\n"); - // file_handle_ = cuimg->file_handle_; ==> Don't do this. it will cause a double free. 
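The constructor change above is the ownership hand-off described in the commit message: the plugin's C-style parser open() cannot return a std::shared_ptr by value, so it returns a heap-allocated shared_ptr as an opaque pointer that the caller dereferences and immediately deletes. A self-contained sketch of the pattern (Handle and plugin_open are hypothetical names, not the real plugin API):

```cpp
#include <cstdio>
#include <memory>

struct Handle { int fd = 42; };

// Plugin side: a C entry point cannot return std::shared_ptr<Handle> by
// value, so it returns a heap-allocated shared_ptr as an opaque pointer.
extern "C" void* plugin_open(const char* /*path*/) {
    return new std::shared_ptr<Handle>(std::make_shared<Handle>());
}

int main() {
    // Caller side: copy the reference out, then free only the wrapper.
    auto* wrapper = static_cast<std::shared_ptr<Handle>*>(plugin_open("a.tif"));
    std::shared_ptr<Handle> handle = *wrapper; // refcount +1
    delete wrapper;                            // refcount -1; handle keeps it alive

    std::printf("fd=%d use_count=%ld\n", handle->fd, handle.use_count());
} // last shared_ptr goes out of scope here; Handle is destroyed exactly once
```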
+ file_handle_ = cuimg->file_handle_; image_format_ = cuimg->image_format_; image_metadata_ = image_metadata; image_data_ = image_data; @@ -233,15 +227,14 @@ CuImage::CuImage(const CuImage* cuimg, CuImage::CuImage() : std::enable_shared_from_this() { PROF_SCOPED_RANGE(PROF_EVENT_P(cuimage_cuimage, 5)); - file_handle_.path = const_cast(""); + file_handle_ = std::make_shared(); + file_handle_->path = const_cast(""); } CuImage::~CuImage() { PROF_SCOPED_RANGE(PROF_EVENT(cuimage__cuimage)); - // printf("[cuCIM] CuImage::~CuImage()\n"); - close(); - image_format_ = nullptr; // memory release is handled by the framework + if (image_metadata_) { // Memory for json_data needs to be manually released if image_metadata_->json_data is not "" @@ -273,18 +266,18 @@ CuImage::~CuImage() image_data_->container.data = nullptr; break; case io::DeviceType::kCUDA: - cudaError_t cuda_status; - CUDA_TRY(cudaFree(image_data_->container.data)); - image_data_->container.data = nullptr; - if (cuda_status) + + if (image_data_->loader) { - fmt::print(stderr, "[Error] Cannot free memory!"); + cudaError_t cuda_status; + CUDA_TRY(cudaFree(image_data_->container.data)); } + image_data_->container.data = nullptr; break; case io::DeviceType::kPinned: case io::DeviceType::kCPUShared: case io::DeviceType::kCUDAShared: - fmt::print(stderr, "Device type {} is not supported!", device_type); + fmt::print(stderr, "Device type {} is not supported!\n", device_type); break; } } @@ -303,9 +296,20 @@ CuImage::~CuImage() cucim_free(image_data_->shm_name); image_data_->shm_name = nullptr; } + if (image_data_->loader) + { + auto loader = reinterpret_cast(image_data_->loader); + delete loader; + + image_data_->loader = nullptr; + } + cucim_free(image_data_); image_data_ = nullptr; } + + close(); // close file handle (NOTE:: close the file handle after loader is deleted) + image_format_ = nullptr; // memory release is handled by the framework } Framework* CuImage::get_framework() @@ -351,7 +355,7 @@ bool CuImage::is_trace_enabled() filesystem::Path CuImage::path() const { - return file_handle_.path == nullptr ? "" : file_handle_.path; + return file_handle_->path == nullptr ? "" : file_handle_->path; } bool CuImage::is_loaded() const { @@ -588,9 +592,27 @@ memory::DLTContainer CuImage::container() const } } +loader::ThreadBatchDataLoader* CuImage::loader() const +{ + if (image_data_) + { + return reinterpret_cast(image_data_->loader); + } + else + { + return nullptr; + } +} + CuImage CuImage::read_region(std::vector&& location, std::vector&& size, uint16_t level, + uint32_t num_workers, + uint32_t batch_size, + bool drop_last, + uint32_t prefetch_factor, + bool shuffle, + uint64_t seed, const DimIndices& region_dim_indices, const io::Device& device, DLTensor* buf, @@ -598,7 +620,6 @@ CuImage CuImage::read_region(std::vector&& location, { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_read_region)); (void)region_dim_indices; - (void)device; (void)buf; (void)shm_name; @@ -621,44 +642,101 @@ CuImage CuImage::read_region(std::vector&& location, size.insert(size.end(), level_dimension.begin(), level_dimension.end()); } + // The number of locations should be the multiplication of the number of dimensions in the size. + if (location.size() % size.size() != 0) + { + throw std::runtime_error( + "[Error] The number of locations should be the multiplication of the number of dimensions in the size!"); + } + + // Make sure the batch size is not zero. 
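With the extended signature above, a single read_region() call can fetch many patches: location is a flattened list of coordinate pairs, and the new loader arguments control batching and prefetch. A hedged usage sketch (assuming the trailing parameters keep their declared defaults as in the examples later in this patch; file name and coordinates are illustrative):

```cpp
#include <memory>

#include "cucim/cuimage.h"

// Sketch: four 256x256 patches at level 0, decoded by two workers in
// batches of two. 'location' is flattened, one coordinate pair per patch,
// so location.size() must be a multiple of size.size() -- the check added above.
void read_patches_sketch()
{
    cucim::CuImage image("input.tiff");

    auto batches = std::make_shared<cucim::CuImage>(image.read_region(
        { 0, 0, 256, 0, 0, 256, 256, 256 }, // location: 4 patches
        { 256, 256 },                       // size: one patch extent
        0,                                  // level
        2,                                  // num_workers: thread pool size
        2,                                  // batch_size > 1 -> 'NYXC' output
        false,                              // drop_last
        2,                                  // prefetch_factor
        false,                              // shuffle
        0));                                // seed

    // With batch_size > 1 the result is iterable; begin()/end() call
    // shared_from_this(), hence the shared_ptr wrapper here.
    for (auto it = batches->begin(); it != batches->end(); ++it)
    {
        // Each step advances the loader; the current batch tensor is
        // exposed through the usual container() accessor.
    }
}
```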
+ if (batch_size == 0) + { + batch_size = 1; + } + + // num_workers would be always > 0 if output device type is CUDA + if (num_workers == 0 && device.type() == cucim::io::DeviceType::kCUDA) + { + num_workers = 1; + } + + uint32_t size_ndim = size.size(); + uint64_t location_len = location.size() / size_ndim; std::string device_name = std::string(device); cucim::io::format::ImageReaderRegionRequestDesc request{}; - int64_t request_location[2] = { location[0], location[1] }; - request.location = request_location; + + if (location_len > 1 || batch_size > 1 || num_workers > 0) + { + // ::Note:: Here, to pass vector data to C interface, we move data in the original vector to the vector in heap + // memory and create a unique pointer with 'new'. The data is transferred to ThreadBatchDataLoader class members + // (locations_ and size_) for automatic deletion on exit. + auto location_ptr = new std::vector(); + location_ptr->swap(location); + auto location_unique = reinterpret_cast(new std::unique_ptr>(location_ptr)); + + auto size_ptr = new std::vector(); + size_ptr->swap(size); + auto size_unique = reinterpret_cast(new std::unique_ptr>(size_ptr)); + + request.location = location_ptr->data(); + request.location_unique = location_unique; + request.size = size_ptr->data(); + request.size_unique = size_unique; + } + else + { + request.location = location.data(); + request.size = size.data(); + } + request.location_len = location_len; + request.size_ndim = size_ndim; request.level = level; - int64_t request_size[2] = { size[0], size[1] }; - request.size = request_size; + request.num_workers = num_workers; + request.batch_size = batch_size; + request.drop_last = drop_last; + request.prefetch_factor = prefetch_factor; + request.shuffle = shuffle; + request.seed = seed; request.device = device_name.data(); - // cucim::io::format::ImageDataDesc image_data{}; + auto image_data = std::unique_ptr( + reinterpret_cast(cucim_malloc(sizeof(io::format::ImageDataDesc))), cucim_free); + memset(image_data.get(), 0, sizeof(io::format::ImageDataDesc)); - cucim::io::format::ImageDataDesc* image_data = - static_cast(cucim_malloc(sizeof(cucim::io::format::ImageDataDesc))); - memset(image_data, 0, sizeof(cucim::io::format::ImageDataDesc)); try { // Read region from internal file if image_data_ is nullptr if (image_data_ == nullptr) { - if (file_handle_.fd < 0) // file_handle_ is not opened + if (!file_handle_) // file_handle_ is not opened { throw std::runtime_error("[Error] The image file is closed!"); } if (!image_format_->image_reader.read( - &file_handle_, image_metadata_, &request, image_data, nullptr /*out_metadata*/)) + file_handle_.get(), image_metadata_, &request, image_data.get(), nullptr /*out_metadata*/)) { - cucim_free(image_data); throw std::runtime_error("[Error] Failed to read image!"); } } else // Read region by cropping image { + const char* dims_str = image_metadata_->dims; + if (strncmp("YXC", dims_str, 4) != 0) + { + throw std::runtime_error(fmt::format("[Error] The image is not in YXC format! ({})", dims_str)); + } + if (image_data_->container.data == nullptr) + { + throw std::runtime_error( + "[Error] The image data is nullptr! It is possible that the object is iterator and the image data " + "is not loaded yet! 
Please advance the iterator first!"); + } crop_image(request, *image_data); } } catch (std::invalid_argument& e) { - cucim_free(image_data); throw e; } @@ -676,6 +754,10 @@ CuImage CuImage::read_region(std::vector&& location, auto& resource = out_metadata.get_resource(); std::string_view dims{ "YXC" }; + if (batch_size > 1) + { + dims = { "NYXC" }; + } // Information from image_data std::pmr::vector shape(&resource); @@ -712,9 +794,17 @@ CuImage CuImage::read_region(std::vector&& location, std::pmr::vector spacing_units(&resource); spacing_units.reserve(ndim); - for (int i = 0; i < ndim; i++) + + int index = 0; + if (ndim == 4) { - int64_t dim_char = dim_indices_.index(dims[i]); + index = 1; + // The first dimension is for 'batch' ('N') + spacing_units.emplace_back(std::string_view{ "batch" }); + } + for (; index < ndim; ++index) + { + int64_t dim_char = dim_indices_.index(dims[index]); const char* str_ptr = image_metadata_->spacing_units[dim_char]; size_t str_len = strlen(image_metadata_->spacing_units[dim_char]); @@ -758,7 +848,7 @@ CuImage CuImage::read_region(std::vector&& location, const uint16_t level_ndim = 2; std::pmr::vector level_dimensions(&resource); level_dimensions.reserve(level_ndim * 1); // it has only one size - level_dimensions.insert(level_dimensions.end(), &size[0], &size[level_ndim]); + level_dimensions.insert(level_dimensions.end(), request.location, &request.location[request.location_len]); std::pmr::vector level_downsamples(&resource); level_downsamples.reserve(1); @@ -766,7 +856,8 @@ CuImage CuImage::read_region(std::vector&& location, std::pmr::vector level_tile_sizes(&resource); level_tile_sizes.reserve(level_ndim * 1); // it has only one size - level_tile_sizes.insert(level_tile_sizes.end(), &size[0], &size[level_ndim]); // same with level_dimension + level_tile_sizes.insert( + level_tile_sizes.end(), request.location, &request.location[request.location_len]); // same with level_dimension // Empty associated images const size_t associated_image_count = 0; @@ -797,7 +888,7 @@ CuImage CuImage::read_region(std::vector&& location, out_metadata.raw_data(raw_data); out_metadata.json_data(json_data); - return CuImage(this, &out_metadata.desc(), image_data); + return CuImage(this, &out_metadata.desc(), image_data.release()); } std::set CuImage::associated_images() const @@ -808,7 +899,7 @@ std::set CuImage::associated_images() const CuImage CuImage::associated_image(const std::string& name, const io::Device& device) const { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_associated_image)); - if (file_handle_.fd < 0) // file_handle_ is not opened + if (file_handle_->fd < 0) // file_handle_ is not opened { throw std::runtime_error("[Error] The image file is closed!"); } @@ -820,20 +911,19 @@ CuImage CuImage::associated_image(const std::string& name, const io::Device& dev std::string device_name = std::string(device); request.device = device_name.data(); - io::format::ImageDataDesc* out_image_data = - static_cast(cucim_malloc(sizeof(cucim::io::format::ImageDataDesc))); + auto out_image_data = std::unique_ptr( + reinterpret_cast(cucim_malloc(sizeof(io::format::ImageDataDesc))), cucim_free); + memset(out_image_data.get(), 0, sizeof(io::format::ImageDataDesc)); io::format::ImageMetadata& out_metadata = *(new io::format::ImageMetadata{}); if (!image_format_->image_reader.read( - &file_handle_, image_metadata_, &request, out_image_data, &out_metadata.desc())) + file_handle_.get(), image_metadata_, &request, out_image_data.get(), &out_metadata.desc())) { - cucim_free(out_image_data); - 
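Both code paths above now wrap the cucim_malloc'ed ImageDataDesc in a std::unique_ptr whose deleter is cucim_free, so the early throw statements no longer leak the descriptor, and release() hands ownership back at the single success exit. A generic sketch of the idiom (PatchDesc stands in for the real descriptor type):

```cpp
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>

struct PatchDesc { int width; int height; }; // stand-in for ImageDataDesc

using PatchDescPtr = std::unique_ptr<PatchDesc, decltype(&std::free)>;

PatchDescPtr make_patch_desc()
{
    // C-style allocation owned by a smart pointer: freed automatically
    // on every exit path, including exceptions thrown below.
    auto desc = PatchDescPtr(static_cast<PatchDesc*>(std::malloc(sizeof(PatchDesc))), std::free);
    if (!desc)
        throw std::runtime_error("allocation failed"); // no manual free needed

    std::memset(desc.get(), 0, sizeof(PatchDesc));
    return desc; // or desc.release() to hand ownership to a C interface
}
```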
delete &out_metadata; throw std::runtime_error("[Error] Failed to read image!"); } - return CuImage(this, &out_metadata.desc(), out_image_data); + return CuImage(this, &out_metadata.desc(), out_image_data.release()); } return CuImage{}; } @@ -892,13 +982,7 @@ void CuImage::save(std::string file_path) const void CuImage::close() { - if (file_handle_.client_data) - { - image_format_->image_parser.close(&file_handle_); - } - file_handle_.cufile = nullptr; - file_handle_.path = nullptr; - file_handle_.fd = -1; + file_handle_ = nullptr; } void CuImage::ensure_init() @@ -947,8 +1031,7 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request io::format::ImageDataDesc& out_image_data) const { PROF_SCOPED_RANGE(PROF_EVENT(cuimage_crop_image)); - // TODO: assume length of location/size to 2. - constexpr int32_t ndims = 2; + const int32_t ndim = request.size_ndim; if (request.level >= image_metadata_->resolution_info.level_count) { @@ -964,7 +1047,7 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request // (we cannot use `ifd->samples_per_pixel()` here) uint32_t samples_per_pixel = static_cast(image_metadata_->shape[dim_indices_.index('C')]); - for (int32_t i = 0; i < ndims; ++i) + for (int32_t i = 0; i < ndim; ++i) { if (request.location[i] < 0) { @@ -1110,4 +1193,217 @@ bool CuImage::crop_image(const io::format::ImageReaderRegionRequestDesc& request return true; } +///////////////////////////// +// Iterator implementation // +///////////////////////////// + +CuImage::iterator CuImage::begin() +{ + return iterator(shared_from_this()); +} +CuImage::iterator CuImage::end() +{ + return iterator(shared_from_this(), true); +} + +CuImage::const_iterator CuImage::begin() const +{ + return const_iterator(shared_from_this()); +} + +CuImage::const_iterator CuImage::end() const +{ + return const_iterator(shared_from_this(), true); +} + +template +CuImageIterator::CuImageIterator(std::shared_ptr cuimg, bool ending) + : cuimg_(cuimg), loader_(nullptr), batch_index_(0), total_batch_count_(0) +{ + if (!cuimg_) + { + throw std::runtime_error("CuImageIterator: cuimg is nullptr!"); + } + + auto& image_data = cuimg_->image_data_; + cucim::loader::ThreadBatchDataLoader* loader = nullptr; + if (image_data) + { + loader = reinterpret_cast(image_data->loader); + loader_ = loader; + } + + if (ending) // point to the end + { + if (image_data) + { + if (loader) + { + total_batch_count_ = loader->total_batch_count(); + batch_index_ = total_batch_count_; + } + else + { + total_batch_count_ = 1; + batch_index_ = 1; + } + } + else + { + batch_index_ = 0; + } + } + else + { + if (image_data) + { + if (loader) + { + total_batch_count_ = loader->total_batch_count(); + if (loader->size() > 1) + { + batch_index_ = loader->processed_batch_count(); + } + else + { + batch_index_ = 0; + } + } + else + { + total_batch_count_ = 1; + batch_index_ = 0; + } + } + else + { + throw std::out_of_range("Batch index out of range! 
('image_data_' is null)"); + } + } +} + +template +typename CuImageIterator::reference CuImageIterator::operator*() const +{ + return cuimg_; +} + +template +typename CuImageIterator::pointer CuImageIterator::operator->() +{ + return cuimg_.get(); +} + +template +CuImageIterator& CuImageIterator::operator++() +{ + // Prefix increment + increase_index_(); + return *this; +} + +template +CuImageIterator CuImageIterator::operator++(int) +{ + // Postfix increment + auto temp(*this); + increase_index_(); + return temp; +} + +template +bool CuImageIterator::operator==(const CuImageIterator& other) +{ + return cuimg_.get() == other.cuimg_.get() && batch_index_ == other.batch_index_; +}; + +template +bool CuImageIterator::operator!=(const CuImageIterator& other) +{ + return cuimg_.get() != other.cuimg_.get() || batch_index_ != other.batch_index_; +}; + +template +int64_t CuImageIterator::index() +{ + auto loader = reinterpret_cast(loader_); + if (loader && (loader->size() > 1)) + { + batch_index_ = loader->processed_batch_count(); + } + return batch_index_; +} + +template +uint64_t CuImageIterator::size() const +{ + return total_batch_count_; +} + +template +void CuImageIterator::increase_index_() +{ + auto loader = reinterpret_cast(loader_); + if (loader) + { + auto next_data = loader->next_data(); + if (next_data) + { + auto& image_data = cuimg_->image_data_; + auto image_data_ptr = reinterpret_cast(&(image_data->container.data)); + + DLContext& ctx = image_data->container.ctx; + auto device_type = static_cast(ctx.device_type); + switch (device_type) + { + case io::DeviceType::kCPU: + if (*image_data_ptr) + { + cucim_free(*image_data_ptr); + } + break; + case io::DeviceType::kCUDA: + if (*image_data_ptr) + { + cudaError_t cuda_status; + CUDA_ERROR(cudaFree(*image_data_ptr)); + } + break; + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + + *image_data_ptr = next_data; + + if (loader->batch_size() > 1) + { + // Set value for dimension 'N' + cuimg_->image_data_->container.shape[0] = loader->data_batch_size(); + cuimg_->image_metadata_->shape[0] = loader->data_batch_size(); + } + } + if (loader->size() > 1) + { + batch_index_ = loader->processed_batch_count(); + } + else + { + if (batch_index_ < static_cast(total_batch_count_)) + { + ++batch_index_; + } + } + } + else + { + if (batch_index_ < static_cast(total_batch_count_)) + { + ++batch_index_; + } + } +} + } // namespace cucim \ No newline at end of file diff --git a/cpp/src/filesystem/cufile_driver.cpp b/cpp/src/filesystem/cufile_driver.cpp index d2a7bd9eb..67ca9cdda 100644 --- a/cpp/src/filesystem/cufile_driver.cpp +++ b/cpp/src/filesystem/cufile_driver.cpp @@ -195,14 +195,13 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ file_flags_ = flags; FileHandleType file_type = (flags & O_DIRECT) ? FileHandleType::kPosixODirect : FileHandleType::kPosix; - handle_ = CuCIMFileHandle{ fd, - nullptr, - file_type, - const_cast(file_path_.c_str()), - this, - static_cast(st.st_dev), - static_cast(st.st_ino), - static_cast(st.st_mtim.tv_nsec) }; + // Copy file path (Allocated memory would be freed at close() method.) 
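increase_index_() above rotates batch buffers: each increment frees the batch the caller has just consumed and adopts the pointer returned by next_data(), which for CUDA output means a cudaFree/cudaMalloc pair per step. A device-neutral sketch of that rotation (BufferRing is illustrative, not the loader class):

```cpp
#include <cstddef>
#include <cstdlib>
#include <vector>

// Illustrative buffer rotation: the consumer always owns exactly one batch
// buffer at a time; advancing hands out the filled slot and replaces it.
class BufferRing {
public:
    explicit BufferRing(std::size_t batch_bytes, std::size_t slot_count)
        : slots_(slot_count), batch_bytes_(batch_bytes) {
        for (auto& s : slots_) s = std::malloc(batch_bytes_); // like raster_data_
    }
    ~BufferRing() { for (auto& s : slots_) std::free(s); }

    // Like next_data(): return the filled head slot and immediately replace
    // it so workers can keep prefetching into the ring behind the consumer.
    void* next() {
        void* ready = slots_[head_];
        slots_[head_] = std::malloc(batch_bytes_);
        head_ = (head_ + 1) % slots_.size();
        return ready; // the caller frees this buffer when done with it
    }

private:
    std::vector<void*> slots_;
    std::size_t batch_bytes_;
    std::size_t head_ = 0;
};
```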
+ char* file_path_cstr = static_cast(cucim_malloc(file_path_.size() + 1)); + memcpy(file_path_cstr, file_path_.c_str(), file_path_.size()); + file_path_cstr[file_path_.size()] = '\0'; + handle_ = std::make_shared(fd, nullptr, file_type, const_cast(file_path_cstr), this, + static_cast(st.st_dev), static_cast(st.st_ino), + static_cast(st.st_mtim.tv_nsec)); CUfileError_t status; CUfileDescr_t cf_descr{}; // It is important to set zero! @@ -212,10 +211,10 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ { cf_descr.handle.fd = fd; cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - status = cuFileHandleRegister(&handle_.cufile, &cf_descr); + status = cuFileHandleRegister(&handle_->cufile, &cf_descr); if (status.err == CU_FILE_SUCCESS) { - handle_.type = FileHandleType::kGPUDirect; + handle_->type = FileHandleType::kGPUDirect; } else { @@ -236,7 +235,7 @@ CuFileDriver::CuFileDriver(int fd, bool no_gds, bool use_mmap, const char* file_ mmap_ptr_ = mmap((void*)0, file_size_, PROT_READ, MAP_SHARED, fd, 0); if (mmap_ptr_ != MAP_FAILED) { - handle_.type = FileHandleType::kMemoryMapped; + handle_->type = FileHandleType::kMemoryMapped; } else { @@ -258,7 +257,7 @@ ssize_t pread(const std::shared_ptr& fd, void* buf, size_t count, } else { - fmt::print(stderr, "fd (CuFileDriver) is null!"); + fmt::print(stderr, "fd (CuFileDriver) is null!\n"); return -1; } } @@ -270,7 +269,7 @@ ssize_t pwrite(const std::shared_ptr& fd, const void* buf, size_t } else { - fmt::print(stderr, "fd (CuFileDriver) is null!"); + fmt::print(stderr, "fd (CuFileDriver) is null!\n"); return -1; } } @@ -448,7 +447,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu cudaPointerAttributes attributes; cudaMemoryType memory_type; - FileHandleType file_type = handle_.type; + FileHandleType file_type = handle_->type; CUDA_TRY(cudaPointerGetAttributes(&attributes, buf)); if (cuda_status) @@ -482,7 +481,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu { break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); CUDA_TRY(cudaMemcpy(output_buf, cache_buf, bytes_to_copy, cudaMemcpyHostToDevice)); if (cuda_status) { @@ -497,7 +496,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu } else { - total_read_cnt = ::pread(handle_.fd, reinterpret_cast(buf) + buf_offset, count, file_offset); + total_read_cnt = ::pread(handle_->fd, reinterpret_cast(buf) + buf_offset, count, file_offset); } } else if (file_type == FileHandleType::kMemoryMapped) @@ -517,7 +516,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu } total_read_cnt = count; } - else if (memory_type == cudaMemoryTypeUnregistered || handle_.type == FileHandleType::kPosixODirect) + else if (memory_type == cudaMemoryTypeUnregistered || handle_->type == FileHandleType::kPosixODirect) { uint64_t buf_align = (reinterpret_cast(buf) + buf_offset) % PAGE_SIZE; bool is_aligned = (buf_align == 0) && ((file_offset % PAGE_SIZE) == 0); @@ -531,7 +530,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (memory_type == cudaMemoryTypeUnregistered) { read_cnt = - ::pread(handle_.fd, reinterpret_cast(buf) + buf_offset, block_read_size, file_offset); + ::pread(handle_->fd, reinterpret_cast(buf) + buf_offset, block_read_size, file_offset); total_read_cnt += read_cnt; } else @@ -551,7 +550,7 @@ ssize_t 
CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); CUDA_TRY(cudaMemcpy(input_buf, cache_buf, bytes_to_copy, cudaMemcpyHostToDevice)); if (cuda_status) { @@ -574,7 +573,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu // Read the remaining block (size of PAGE_SIZE) ssize_t read_cnt; - read_cnt = ::pread(handle_.fd, buf_pos, PAGE_SIZE, block_read_size); + read_cnt = ::pread(handle_->fd, buf_pos, PAGE_SIZE, block_read_size); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the remaining file content block! ({})\n", std::strerror(errno)); @@ -612,7 +611,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (large_block_size <= cache_size) // Optimize if bytes to load is less than cache_size { - ssize_t read_cnt = ::pread(handle_.fd, cache_buf, large_block_size, file_start_offset); + ssize_t read_cnt = ::pread(handle_->fd, cache_buf, large_block_size, file_start_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the file content block! ({})\n", std::strerror(errno)); @@ -652,7 +651,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu // Handle the head part of the file content if (header_size) { - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, read_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, read_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the head part of the file content block! ({})\n", @@ -691,7 +690,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu break; } - read_cnt = ::pread(handle_.fd, cache_buf, bytes_to_copy, read_offset); + read_cnt = ::pread(handle_->fd, cache_buf, bytes_to_copy, read_offset); if (memory_type == cudaMemoryTypeUnregistered) { memcpy(output_buf, cache_buf, bytes_to_copy); @@ -715,7 +714,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu if (tail_size) { // memset(internal_buf_pos, 0, PAGE_SIZE); // no need to initialize for pread() - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, read_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, read_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the tail part of the file content block! 
({})\n", @@ -745,7 +744,7 @@ ssize_t CuFileDriver::pread(void* buf, size_t count, off_t file_offset, off_t bu { (void*)s_cufile_cache.device_cache(); // Lazy initialization - ssize_t read_cnt = cuFileRead(handle_.cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); + ssize_t read_cnt = cuFileRead(handle_->cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); total_read_cnt += read_cnt; if (read_cnt < 0) { @@ -775,7 +774,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o cudaPointerAttributes attributes; cudaMemoryType memory_type; - FileHandleType file_type = handle_.type; + FileHandleType file_type = handle_->type; CUDA_TRY(cudaPointerGetAttributes(&attributes, buf)); if (cuda_status) @@ -808,7 +807,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o { return -1; } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; remaining_size -= write_cnt; @@ -818,7 +817,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } else { - total_write_cnt = ::pwrite(handle_.fd, reinterpret_cast(buf) + buf_offset, count, file_offset); + total_write_cnt = ::pwrite(handle_->fd, reinterpret_cast(buf) + buf_offset, count, file_offset); } } else if (file_type == FileHandleType::kMemoryMapped) @@ -826,7 +825,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o fmt::print(stderr, "[Error] pwrite() is not supported for Memory-mapped IO file type!\n"); return -1; } - else if (memory_type == cudaMemoryTypeUnregistered || handle_.type == FileHandleType::kPosixODirect) + else if (memory_type == cudaMemoryTypeUnregistered || handle_->type == FileHandleType::kPosixODirect) { uint64_t buf_align = (reinterpret_cast(buf) + buf_offset) % PAGE_SIZE; bool is_aligned = (buf_align == 0) && ((file_offset % PAGE_SIZE) == 0); @@ -841,7 +840,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (memory_type == cudaMemoryTypeUnregistered) { write_cnt = ::pwrite( - handle_.fd, reinterpret_cast(buf) + buf_offset, block_write_size, file_offset); + handle_->fd, reinterpret_cast(buf) + buf_offset, block_write_size, file_offset); total_write_cnt += write_cnt; } else @@ -866,7 +865,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o { return -1; } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; remaining_size -= write_cnt; @@ -885,7 +884,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o // Read the remaining block (size of PAGE_SIZE) ssize_t read_cnt; - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, block_write_size); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, block_write_size); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the remaining file content block! 
({})\n", std::strerror(errno)); @@ -908,7 +907,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, block_write_size); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, block_write_size); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the remaining file content! ({})\n", std::strerror(errno)); @@ -933,7 +932,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (large_block_size <= cache_size) // Optimize if bytes to write is less than cache_size { memset(cache_buf, 0, PAGE_SIZE); - ssize_t read_cnt = ::pread(handle_.fd, cache_buf, PAGE_SIZE, file_start_offset); + ssize_t read_cnt = ::pread(handle_->fd, cache_buf, PAGE_SIZE, file_start_offset); if (read_cnt < 0) { fmt::print( @@ -942,7 +941,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } if (large_block_size > PAGE_SIZE) { - read_cnt = ::pread(handle_.fd, cache_buf + large_block_size - PAGE_SIZE, PAGE_SIZE, + read_cnt = ::pread(handle_->fd, cache_buf + large_block_size - PAGE_SIZE, PAGE_SIZE, end_boundary_offset - PAGE_SIZE); if (read_cnt < 0) { @@ -966,7 +965,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - ssize_t write_cnt = ::pwrite(handle_.fd, cache_buf, large_block_size, file_start_offset); + ssize_t write_cnt = ::pwrite(handle_->fd, cache_buf, large_block_size, file_start_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the file content block! ({})\n", std::strerror(errno)); @@ -993,7 +992,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o // Handle the head part of the file content if (header_size) { - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the head part of the file content block! ({})\n", @@ -1017,7 +1016,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the head part of the file content block! ({})\n", @@ -1052,7 +1051,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o return -1; } } - write_cnt = ::pwrite(handle_.fd, cache_buf, bytes_to_copy, write_offset); + write_cnt = ::pwrite(handle_->fd, cache_buf, bytes_to_copy, write_offset); write_offset += write_cnt; input_buf += write_cnt; body_remaining_size -= write_cnt; @@ -1064,7 +1063,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o if (tail_size) { memset(internal_buf_pos, 0, PAGE_SIZE); - read_cnt = ::pread(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + read_cnt = ::pread(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (read_cnt < 0) { fmt::print(stderr, "Cannot read the tail part of the file content block! 
({})\n", @@ -1087,7 +1086,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } // Write the constructed block - write_cnt = ::pwrite(handle_.fd, internal_buf_pos, PAGE_SIZE, write_offset); + write_cnt = ::pwrite(handle_->fd, internal_buf_pos, PAGE_SIZE, write_offset); if (write_cnt < 0) { fmt::print(stderr, "Cannot write the tail part of the file content block! ({})\n", @@ -1104,7 +1103,7 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o (void*)s_cufile_cache.device_cache(); // Lazy initialization ssize_t write_cnt = - cuFileWrite(handle_.cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); + cuFileWrite(handle_->cufile, reinterpret_cast(buf) + buf_offset, count, file_offset, 0); if (write_cnt < 0) { fmt::print(stderr, "[cuFile Error] {}\n", CUFILE_ERRSTR(write_cnt)); @@ -1122,10 +1121,10 @@ ssize_t CuFileDriver::pwrite(const void* buf, size_t count, off_t file_offset, o } bool CuFileDriver::close() { - if (handle_.cufile) + if (handle_->cufile) { - cuFileHandleDeregister(handle_.cufile); - handle_.cufile = nullptr; + cuFileHandleDeregister(handle_->cufile); + } if (mmap_ptr_) { @@ -1136,22 +1135,21 @@ bool CuFileDriver::close() } mmap_ptr_ = nullptr; } - if (handle_.fd != -1) + if (handle_->fd != -1) { // If block write was used if ((file_flags_ & O_RDWR) && - (handle_.type == FileHandleType::kGPUDirect || handle_.type == FileHandleType::kPosixODirect)) + (handle_->type == FileHandleType::kGPUDirect || handle_->type == FileHandleType::kPosixODirect)) { // Truncate file assuming that `file_size_` is up to date during pwrite() calls - int err = ::ftruncate(handle_.fd, file_size_); + int err = ::ftruncate(handle_->fd, file_size_); if (err < 0) { - fmt::print(stderr, "[Error] Cannot resize the file {} to {} ({})\n", handle_.path, file_size_, + fmt::print(stderr, "[Error] Cannot resize the file {} to {} ({})\n", handle_->path, file_size_, std::strerror(errno)); } } - ::close(handle_.fd); - handle_.fd = -1; + handle_ = nullptr; } file_path_.clear(); file_size_ = 0; diff --git a/cpp/src/loader/batch_data_processor.cpp b/cpp/src/loader/batch_data_processor.cpp new file mode 100644 index 000000000..2ba188b18 --- /dev/null +++ b/cpp/src/loader/batch_data_processor.cpp @@ -0,0 +1,77 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
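The O_DIRECT branches above never issue a transfer that is not a whole number of PAGE_SIZE blocks: a misaligned head or tail is first read back into an aligned scratch buffer, patched in memory, and written out again. A simplified sketch of that read-modify-write for one head block (4096-byte pages assumed; the caller supplies a page-aligned scratch buffer; error handling elided):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <unistd.h>

constexpr std::size_t kPageSize = 4096;

// Read-modify-write one misaligned head block through an O_DIRECT 'fd':
// O_DIRECT requires page-aligned offsets and sizes, so patch a full page.
ssize_t write_unaligned_head(int fd, const char* src, std::size_t count,
                             off_t file_offset, void* aligned_buf)
{
    off_t block_offset = (file_offset / kPageSize) * kPageSize; // round down
    std::size_t in_page = static_cast<std::size_t>(file_offset - block_offset);
    std::size_t n = std::min(count, kPageSize - in_page);

    if (::pread(fd, aligned_buf, kPageSize, block_offset) < 0)      // fetch old page
        return -1;
    std::memcpy(static_cast<char*>(aligned_buf) + in_page, src, n); // patch bytes
    return ::pwrite(fd, aligned_buf, kPageSize, block_offset);      // rewrite page
}
```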
+ */ + +#include "cucim/loader/batch_data_processor.h" + +#include <deque> +#include <memory> + +#include "cucim/cache/image_cache_manager.h" + +namespace cucim::loader +{ + +BatchDataProcessor::BatchDataProcessor(const uint32_t batch_size) : batch_size_(batch_size), processed_index_count_(0) +{ +} + +BatchDataProcessor::~BatchDataProcessor() +{ +} + + +void BatchDataProcessor::add_tile(const TileInfo& tile) +{ + tiles_.emplace_back(tile); + ++total_index_count_; +} + +TileInfo BatchDataProcessor::remove_front_tile() +{ + const TileInfo tile = tiles_.front(); + tiles_.pop_front(); + ++processed_index_count_; + return tile; +} + +uint32_t BatchDataProcessor::request(std::deque<uint32_t>& batch_item_counts, const uint32_t num_remaining_patches) +{ + (void)batch_item_counts; + (void)num_remaining_patches; + return 0; +} + +uint32_t BatchDataProcessor::wait_batch(const uint32_t index_in_task, + std::deque<uint32_t>& batch_item_counts, + const uint32_t num_remaining_patches) +{ + (void)index_in_task; + (void)batch_item_counts; + (void)num_remaining_patches; + return 0; +} + +std::shared_ptr<cucim::cache::ImageCacheValue> BatchDataProcessor::wait_for_processing(const uint32_t) +{ + return std::shared_ptr<cucim::cache::ImageCacheValue>(); +} + +void BatchDataProcessor::shutdown() +{ +} + +} // namespace cucim::loader diff --git a/cpp/src/loader/thread_batch_data_loader.cpp b/cpp/src/loader/thread_batch_data_loader.cpp new file mode 100644 index 000000000..1d5cd5498 --- /dev/null +++ b/cpp/src/loader/thread_batch_data_loader.cpp @@ -0,0 +1,320 @@ +/* + * Apache License, Version 2.0 + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cucim/loader/thread_batch_data_loader.h" + +#include + +#include + +#include "cucim/profiler/nvtx3.h" +#include "cucim/util/cuda.h" + +namespace cucim::loader +{ + +ThreadBatchDataLoader::ThreadBatchDataLoader(LoadFunc load_func, + std::unique_ptr batch_data_processor, + const cucim::io::Device out_device, + std::unique_ptr> location, + std::unique_ptr> image_size, + const uint64_t location_len, + const size_t one_raster_size, + const uint32_t batch_size, + const uint32_t prefetch_factor, + const uint32_t num_workers) + : load_func_(load_func), + out_device_(out_device), + location_(std::move(location)), + image_size_(std::move(image_size)), + location_len_(location_len), + one_rester_size_(one_raster_size), + batch_size_(batch_size), + prefetch_factor_(prefetch_factor), + num_workers_(num_workers), + batch_data_processor_(std::move(batch_data_processor)), + buffer_size_(one_raster_size * batch_size), + thread_pool_(num_workers), + queued_item_count_(0), + buffer_item_head_index_(0), + buffer_item_tail_index_(0), + processed_batch_count_(0), + current_data_(nullptr), + current_data_batch_size_(0) +{ + buffer_item_len_ = std::min(static_cast(location_len_), static_cast(1 + prefetch_factor_)), + + raster_data_.reserve(buffer_item_len_); + cucim::io::DeviceType device_type = out_device_.type(); + for (size_t i = 0; i < buffer_item_len_; ++i) + { + switch (device_type) + { + case io::DeviceType::kCPU: + raster_data_.emplace_back(static_cast(cucim_malloc(buffer_size_))); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + void* image_data_ptr = nullptr; + CUDA_ERROR(cudaMalloc(&image_data_ptr, buffer_size_)); + raster_data_.emplace_back(static_cast(image_data_ptr)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + } +} + +ThreadBatchDataLoader::~ThreadBatchDataLoader() +{ + cucim::io::DeviceType device_type = out_device_.type(); + for (auto& raster_ptr : raster_data_) + { + switch (device_type) + { + case io::DeviceType::kCPU: + if (raster_ptr) + { + cucim_free(raster_ptr); + } + break; + case io::DeviceType::kCUDA: + cudaError_t cuda_status; + if (raster_ptr) + { + cuda_status = cudaSuccess; + CUDA_TRY(cudaFree(raster_ptr)); + } + break; + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!", device_type); + break; + } + raster_ptr = nullptr; + } + if (batch_data_processor_) + { + stopped_ = true; + batch_data_processor_->shutdown(); + } +} + +ThreadBatchDataLoader::operator bool() const +{ + return (num_workers_ > 0); +} + +uint8_t* ThreadBatchDataLoader::raster_pointer(const uint64_t location_index) const +{ + uint64_t buffer_item_index = (location_index / batch_size_) % buffer_item_len_; + uint32_t raster_data_index = location_index % batch_size_; + + assert(buffer_item_index < buffer_item_len_); + + uint8_t* batch_raster_ptr = raster_data_[buffer_item_index]; + + return &batch_raster_ptr[raster_data_index * one_rester_size_]; +} + +uint32_t ThreadBatchDataLoader::request(uint32_t load_size) +{ + if (num_workers_ == 0) + { + return 0; + } + + if (load_size == 0) + { + load_size = batch_size_; + } + + uint32_t num_items_to_request = std::min(load_size, static_cast(location_len_ - queued_item_count_)); + for (uint32_t i = 0; i < num_items_to_request; ++i) + { + uint32_t last_item_count = 0; + if 
(!tasks_.empty()) + { + last_item_count = tasks_.size(); + } + load_func_(this, queued_item_count_); + ++queued_item_count_; + buffer_item_tail_index_ = queued_item_count_ % buffer_item_len_; + // Append the number of added tasks to the batch count list. + batch_item_counts_.emplace_back(tasks_.size() - last_item_count); + } + + if (batch_data_processor_) + { + uint32_t num_remaining_patches = static_cast(location_len_ - queued_item_count_); + batch_data_processor_->request(batch_item_counts_, num_remaining_patches); + } + return num_items_to_request; +} + +uint32_t ThreadBatchDataLoader::wait_batch() +{ + if (num_workers_ == 0) + { + return 0; + } + + uint32_t num_items_waited = 0; + for (uint32_t batch_item_index = 0; batch_item_index < batch_size_ && !batch_item_counts_.empty(); ++batch_item_index) + { + uint32_t batch_item_count = batch_item_counts_.front(); + for (uint32_t i = 0; i < batch_item_count; ++i) + { + auto& future = tasks_.front(); + future.wait(); + tasks_.pop_front(); + if (batch_data_processor_) + { + batch_data_processor_->remove_front_tile(); + uint32_t num_remaining_patches = static_cast(location_len_ - queued_item_count_); + batch_data_processor_->wait_batch(i, batch_item_counts_, num_remaining_patches); + } + } + batch_item_counts_.pop_front(); + num_items_waited += batch_item_count; + } + return num_items_waited; +} + + +uint8_t* ThreadBatchDataLoader::next_data() +{ + if (num_workers_ == 0) // (location_len == 1 && batch_size == 1) + { + // If it reads entire image with multi threads (using loader), release raster memory from batch data loader. + uint8_t* batch_raster_ptr = raster_data_[0]; + raster_data_[0] = nullptr; + return batch_raster_ptr; + } + + if (processed_batch_count_ * batch_size_ >= location_len_) + { + // Remove buffer items that are no longer needed. + for (size_t i = 0; i < buffer_item_len_; ++i) + { + raster_data_[i] = nullptr; + } + return nullptr; + } + + // Wait until the batch is ready. 
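request() above queues decode tasks and records how many tasks belong to each batch; wait_batch() later blocks on exactly that many futures before a batch is handed out. A minimal sketch of the same bookkeeping with std::async standing in for the thread pool (PrefetchQueue is illustrative, not the loader class):

```cpp
#include <cstdint>
#include <deque>
#include <functional>
#include <future>

// Minimal sketch of the request()/wait_batch() bookkeeping: every queued
// item contributes one future, and a batch is "ready" once the futures
// recorded for it have all completed.
class PrefetchQueue {
public:
    // Like request(): launch one decode task and remember its batch count.
    void request(std::function<void()> decode_one) {
        tasks_.emplace_back(std::async(std::launch::async, std::move(decode_one)));
        batch_item_counts_.emplace_back(1); // one task per item in this sketch
    }

    // Like wait_batch(): block until the oldest batch's tasks are done.
    void wait_front_batch() {
        if (batch_item_counts_.empty()) return;
        for (uint32_t i = 0; i < batch_item_counts_.front(); ++i) {
            tasks_.front().wait();
            tasks_.pop_front();
        }
        batch_item_counts_.pop_front();
    }

private:
    std::deque<std::future<void>> tasks_;
    std::deque<uint32_t> batch_item_counts_;
};
```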
+ wait_batch(); + + uint8_t* batch_raster_ptr = raster_data_[buffer_item_head_index_]; + + cucim::io::DeviceType device_type = out_device_.type(); + switch (device_type) + { + case io::DeviceType::kCPU: + raster_data_[buffer_item_head_index_] = static_cast(cucim_malloc(buffer_size_)); + break; + case io::DeviceType::kCUDA: { + cudaError_t cuda_status; + CUDA_ERROR(cudaMalloc(&raster_data_[buffer_item_head_index_], buffer_size_)); + break; + } + case io::DeviceType::kPinned: + case io::DeviceType::kCPUShared: + case io::DeviceType::kCUDAShared: + fmt::print(stderr, "Device type {} is not supported!\n", device_type); + break; + } + + buffer_item_head_index_ = (buffer_item_head_index_ + 1) % buffer_item_len_; + + current_data_ = batch_raster_ptr; + current_data_batch_size_ = + std::min(location_len_ - (processed_batch_count_ * batch_size_), static_cast(batch_size_)); + + ++processed_batch_count_; + + // Prepare the next batch + request(batch_size_); + return batch_raster_ptr; +} + +BatchDataProcessor* ThreadBatchDataLoader::batch_data_processor() +{ + return batch_data_processor_.get(); +} + +std::shared_ptr ThreadBatchDataLoader::wait_for_processing(uint32_t index) +{ + if (batch_data_processor_ == nullptr || stopped_) + { + return std::shared_ptr(); + } + + return batch_data_processor_->wait_for_processing(index); +} + +uint64_t ThreadBatchDataLoader::size() const +{ + return location_len_; +} + +uint32_t ThreadBatchDataLoader::batch_size() const +{ + return batch_size_; +} + +uint64_t ThreadBatchDataLoader::total_batch_count() const +{ + return (location_len_ + batch_size_ - 1) / batch_size_; +} + +uint64_t ThreadBatchDataLoader::processed_batch_count() const +{ + return processed_batch_count_; +} + +uint8_t* ThreadBatchDataLoader::data() const +{ + return current_data_; +} + +uint32_t ThreadBatchDataLoader::data_batch_size() const +{ + return current_data_batch_size_; +} + +bool ThreadBatchDataLoader::enqueue(std::function task, const TileInfo& tile) +{ + if (num_workers_ > 0) + { + auto future = thread_pool_.enqueue(task); + tasks_.emplace_back(std::move(future)); + if (batch_data_processor_) + { + batch_data_processor_->add_tile(tile); + } + return true; + } + return false; +} + +} // namespace cucim::loader diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eb5910e33..2e892875b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -43,12 +43,21 @@ target_compile_definitions(cucim_tests CUCIM_VERSION_PATCH=${PROJECT_VERSION_PATCH} CUCIM_VERSION_BUILD=${PROJECT_VERSION_BUILD} ) + +# Include pthread +# (https://cmake.org/cmake/help/v3.18/module/FindThreads.html) +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) + target_link_libraries(cucim_tests PRIVATE CUDA::cudart ${CUCIM_PACKAGE_NAME} deps::catch2 deps::openslide + deps::taskflow + Threads::Threads # -lpthread ) include(ParseAndAddCatchTests) diff --git a/cucim.code-workspace b/cucim.code-workspace index f5a63064b..af60d4b02 100644 --- a/cucim.code-workspace +++ b/cucim.code-workspace @@ -6,6 +6,9 @@ { "path": "cpp/plugins/cucim.kit.cuslide" }, + { + "path": "cpp/plugins/cucim.kit.cumed" + }, { "path": "python" } @@ -141,7 +144,15 @@ "__node_handle": "cpp", "__memory": "cpp", "*.def": "cpp", - "__mutex_base": "cpp" + "__mutex_base": "cpp", + "mprealsupport": "cpp", + "compare": "cpp", + "concepts": "cpp", + "coroutine": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "tensor": "cpp" }, // 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eb5910e33..2e892875b 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -43,12 +43,21 @@ target_compile_definitions(cucim_tests
         CUCIM_VERSION_PATCH=${PROJECT_VERSION_PATCH}
         CUCIM_VERSION_BUILD=${PROJECT_VERSION_BUILD}
 )
+
+# Include pthread
+# (https://cmake.org/cmake/help/v3.18/module/FindThreads.html)
+set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+find_package(Threads REQUIRED)
+
 target_link_libraries(cucim_tests
         PRIVATE
             CUDA::cudart
             ${CUCIM_PACKAGE_NAME}
             deps::catch2
             deps::openslide
+            deps::taskflow
+            Threads::Threads # -lpthread
 )
 
 include(ParseAndAddCatchTests)
diff --git a/cucim.code-workspace b/cucim.code-workspace
index f5a63064b..af60d4b02 100644
--- a/cucim.code-workspace
+++ b/cucim.code-workspace
@@ -6,6 +6,9 @@
         {
             "path": "cpp/plugins/cucim.kit.cuslide"
         },
+        {
+            "path": "cpp/plugins/cucim.kit.cumed"
+        },
         {
             "path": "python"
         }
@@ -141,7 +144,15 @@
             "__node_handle": "cpp",
             "__memory": "cpp",
             "*.def": "cpp",
-            "__mutex_base": "cpp"
+            "__mutex_base": "cpp",
+            "mprealsupport": "cpp",
+            "compare": "cpp",
+            "concepts": "cpp",
+            "coroutine": "cpp",
+            "numbers": "cpp",
+            "semaphore": "cpp",
+            "stop_token": "cpp",
+            "tensor": "cpp"
         },
         // https://vector-of-bool.github.io/docs/vscode-cmake-tools/settings.html
         "cmake.buildTask": true,
@@ -160,7 +171,6 @@
                 "options": {
                     "cwd": "${workspaceFolder}",
                     "env": {
-                        // Workaround the environment variable issue: https://github.com/microsoft/vscode/issues/121470
                         "PATH": "${env:HOME}/.local/bin:${env:PATH}"
                     }
                 },
@@ -179,14 +189,17 @@
                 "type": "shell",
                 "command": "./run build_local all release",
                 "options": {
-                    "cwd": "${workspaceFolder}"
+                    "cwd": "${workspaceFolder}",
+                    "env": {
+                        "PATH": "${env:HOME}/.local/bin:${env:PATH}"
+                    }
                 },
                 "presentation": {
                     "reveal": "always",
                     "focus": true
                 },
                 "problemMatcher": [],
-                "group": "build",
+                "group": "build"
             }
         ]
     },
@@ -210,7 +223,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -238,7 +251,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -259,9 +272,10 @@
             "name": "(gdb) cucim_py",
             "type": "cppdbg",
             "request": "launch",
-            "program": "/usr/bin/python3",
+            "program": "/usr/bin/bash",
             // https://github.com/catchorg/Catch2/blob/devel/docs/command-line.md#specifying-which-tests-to-run
             "args": [
+                "${workspaceFolder:cucim}/scripts/debug_python",
                 "${workspaceFolder:python}/cucim/src/localtest.py",
             ],
             "stopAtEntry": false,
@@ -269,7 +283,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 },
                 {
                     "name": "CUCIM_TEST_PLUGIN_PATH",
@@ -301,7 +315,7 @@
             "environment": [
                 {
                     "name": "LD_LIBRARY_PATH",
-                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${env:LD_LIBRARY_PATH}"
+                    "value": "${workspaceFolder:cucim}/build-debug/lib:${workspaceFolder:cucim.kit.cuslide}/build-debug/lib:${workspaceFolder:cucim.kit.cumed}/build-debug/lib:${env:LD_LIBRARY_PATH}"
                 }
             ],
             "console": "externalTerminal",
diff --git a/examples/cpp/tiff_image/main.cpp b/examples/cpp/tiff_image/main.cpp
index 051652285..41b212e1c 100644
--- a/examples/cpp/tiff_image/main.cpp
+++ b/examples/cpp/tiff_image/main.cpp
@@ -59,8 +59,7 @@ int main(int argc, char* argv[])
     fmt::print("channel_names: ({})\n", fmt::join(associated_image.channel_names(), ", "));
     fmt::print("\n");
 
-    cucim::CuImage region =
-        image.read_region({ 10000, 10000 }, { 1024, 1024 }, 0, cucim::DimIndices{}, "cpu", nullptr, "");
+    cucim::CuImage region = image.read_region({ 10000, 10000 }, { 1024, 1024 }, 0);
 
     fmt::print("is_loaded: {}\n", region.is_loaded());
     fmt::print("device: {}\n", std::string(region.device()));
@@ -83,9 +82,21 @@ int main(int argc, char* argv[])
 
     region.save(fmt::format("{}/output.ppm", output_folder_path));
 
-    cucim::CuImage region2 =
-        image.read_region({ 5000, 5000 }, { 1024, 1024 }, 1, cucim::DimIndices{}, "cpu", nullptr, "");
+    cucim::CuImage region2 = image.read_region({ 5000, 5000 }, { 1024, 1024 }, 1);
 
     region2.save(fmt::format("{}/output2.ppm", output_folder_path));
 
+    // Batch loading image
+    // Note: the source cucim::CuImage must be owned by a std::shared_ptr; otherwise,
+    //       iterating over the batch result throws std::bad_weak_ptr.
+    auto batch_image = std::make_shared<cucim::CuImage>(input_file_path);
+
+    auto region3 = std::make_shared<cucim::CuImage>(batch_image->read_region(
+        { 0, 0, 100, 200, 300, 300, 400, 400, 500, 500, 600, 600, 700, 700, 800, 800, 900, 900, 1000, 1000 },
+        { 200, 200 }, 0 /*level*/, 2 /*num_workers*/, 2 /*batch_size*/, false /*drop_last*/, 1 /*prefetch_factor*/,
+        false /*shuffle*/, 0 /*seed*/));
+
+    for (auto batch : *region3)
+    {
+        fmt::print("shape: {}, data size: {}\n", fmt::join(batch->shape(), ", "), batch->container().size());
+    }
 
     return 0;
 }
diff --git a/python/cucim/src/localtest.py b/python/cucim/src/localtest.py
index 59955dbc8..b255b8287 100644
--- a/python/cucim/src/localtest.py
+++ b/python/cucim/src/localtest.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
index fe2a3b976..676168937 100644
--- a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
+++ b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py
@@ -18,14 +18,14 @@
 from ...util.io import open_image_cucim
 
 
-def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256):
+def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256_jpeg):
     import GPUtil
 
     gpus = GPUtil.getGPUs()
 
     if len(gpus) == 0:
         pytest.skip('No gpu available')
 
-    img = open_image_cucim(testimg_tiff_stripe_4096x4096_256)
+    img = open_image_cucim(testimg_tiff_stripe_4096x4096_256_jpeg)
 
     gpu = gpus[0]
     mem_usage_history = [gpu.memoryUsed]
@@ -38,11 +38,10 @@ def test_read_region_cuda_memleak(testimg_tiff_stripe_4096x4096_256):
 
     print(mem_usage_history)
 
-    # Memory usage difference should be less than 40MB
+    # The difference in memory usage should be less than 180MB.
     # Note: Since we cannot measure GPU memory usage for a process,
-    # we use a rough number.
-    # Actual CUDA memory used would be 48MB per iteration (4096x4096x3).
-    assert mem_usage_history[4] - mem_usage_history[1] < 40.0
+    # we use a rough number (experimentally measured).
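+    # (The bound is loose: with device='cuda', CUDA and nvJPEG keep internal
+    # allocations alive across iterations, so usage sits well above the raw
+    # raster size without actually leaking.)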
+    assert mem_usage_history[4] - mem_usage_history[1] < 180.0
 
 
 def test_read_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256):
diff --git a/python/pybind11/cache/cache_py.cpp b/python/pybind11/cache/cache_py.cpp
index f46352cb1..f028aa922 100644
--- a/python/pybind11/cache/cache_py.cpp
+++ b/python/pybind11/cache/cache_py.cpp
@@ -123,12 +123,12 @@ void py_image_cache_reserve(ImageCache& cache, uint32_t memory_capacity, py::kwargs kwargs)
 }
 
 py::int_ py_preferred_memory_capacity(const py::object& img,
-                                      const std::optional<std::vector<uint32_t>>& image_size,
+                                      const std::optional<std::vector<uint64_t>>& image_size,
                                       const std::optional<std::vector<uint32_t>>& tile_size,
                                       const std::optional<std::vector<uint32_t>>& patch_size,
                                       uint32_t bytes_per_pixel)
 {
-    std::vector<uint32_t> param_image;
+    std::vector<uint64_t> param_image;
     std::vector<uint32_t> param_tile;
     std::vector<uint32_t> param_patch;
 
diff --git a/python/pybind11/cache/cache_py.h b/python/pybind11/cache/cache_py.h
index e0b817fcd..99af48bba 100644
--- a/python/pybind11/cache/cache_py.h
+++ b/python/pybind11/cache/cache_py.h
@@ -40,7 +40,7 @@ py::dict py_config(ImageCache& cache);
 void py_image_cache_reserve(ImageCache& cache, uint32_t memory_capacity, py::kwargs kwargs);
 
 py::int_ py_preferred_memory_capacity(const py::object& img,
-                                      const std::optional<std::vector<uint32_t>>& image_size,
+                                      const std::optional<std::vector<uint64_t>>& image_size,
                                       const std::optional<std::vector<uint32_t>>& tile_size,
                                       const std::optional<std::vector<uint32_t>>& patch_size,
                                       uint32_t bytes_per_pixel);
diff --git a/python/pybind11/cache/cache_pydoc.h b/python/pybind11/cache/cache_pydoc.h
index 67840a40a..cca61557e 100644
--- a/python/pybind11/cache/cache_pydoc.h
+++ b/python/pybind11/cache/cache_pydoc.h
@@ -22,7 +22,7 @@ namespace cucim::cache::doc
 
 // py::int_ py_preferred_memory_capacity(const py::object& img,
-//                                       const std::optional<std::vector<uint32_t>>& image_size,
+//                                       const std::optional<std::vector<uint64_t>>& image_size,
 //                                       const std::optional<std::vector<uint32_t>>& tile_size,
 //                                       const std::optional<std::vector<uint32_t>>& patch_size,
 //                                       uint32_t bytes_per_pixel);
diff --git a/python/pybind11/cucim_py.cpp b/python/pybind11/cucim_py.cpp
index be91f9d3f..e3f48a03d 100644
--- a/python/pybind11/cucim_py.cpp
+++ b/python/pybind11/cucim_py.cpp
@@ -148,9 +148,15 @@ PYBIND11_MODULE(_cucim, m)
         .def_property("resolutions", &py_resolutions, nullptr, doc::CuImage::doc_resolutions,
                       py::call_guard<py::gil_scoped_release>()) //
         .def("read_region", &py_read_region, doc::CuImage::doc_read_region, py::call_guard<py::gil_scoped_release>(), //
-             py::arg("location") = py::list{}, //
-             py::arg("size") = py::list{}, //
+             py::arg("location") = py::tuple{}, //
+             py::arg("size") = py::tuple{}, //
              py::arg("level") = 0, //
+             py::arg("num_workers") = 0, //
+             py::arg("batch_size") = 1, //
+             py::arg("drop_last") = py::bool_(false), //
+             py::arg("prefetch_factor") = 2, //
+             py::arg("shuffle") = py::bool_(false), //
+             py::arg("seed") = py::int_(0), //
              py::arg("device") = io::Device(), //
              py::arg("buf") = py::none(), //
              py::arg("shm_name") = "") //
@@ -163,6 +169,12 @@ PYBIND11_MODULE(_cucim, m)
         .def("save", &CuImage::save, doc::CuImage::doc_save, py::call_guard<py::gil_scoped_release>()) //
         .def("close", &CuImage::close, doc::CuImage::doc_close, py::call_guard<py::gil_scoped_release>()) //
         .def("__bool__", &CuImage::operator bool, py::call_guard<py::gil_scoped_release>()) //
+        .def(
+            "__iter__", //
+            [](const std::shared_ptr<CuImage>& cuimg) { //
+                return cuimg->begin(); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
         .def(
             "__enter__",
             [](const std::shared_ptr<CuImage>& cuimg) { //
@@ -183,6 +195,30 @@ PYBIND11_MODULE(_cucim, m)
             },
             py::call_guard<py::gil_scoped_release>());
 
+    py::class_<CuImageIterator<CuImage>, std::shared_ptr<CuImageIterator<CuImage>>>(m, "CuImageIterator") //
+        .def(py::init<std::shared_ptr<CuImage>, bool>(), doc::CuImageIterator::doc_CuImageIterator,
+             py::arg("cuimg"), //
+             py::arg("ending") = false, py::call_guard<py::gil_scoped_release>())
+        .def(
+            "__len__",
+            [](const CuImageIterator<CuImage>& it) { //
+                return it.size(); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
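+        // (`__iter__` below returns a copy so that each Python-level iter() call
+        // keeps its own position.)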
+        .def(
+            "__iter__", //
+            [](CuImageIterator<CuImage>& it) { //
+                return CuImageIterator<CuImage>(it); //
+            }, //
+            py::call_guard<py::gil_scoped_release>())
+        .def("__next__", &py_cuimage_iterator_next, py::call_guard<py::gil_scoped_release>())
+        .def(
+            "__repr__", //
+            [](CuImageIterator<CuImage>& it) { //
+                return fmt::format("<cucim.CuImageIterator index:{}>", it.index());
+            },
+            py::call_guard<py::gil_scoped_release>());
+
     // We can use `"cpu"` instead of `Device("cpu")`
     py::implicitly_convertible<std::string, io::Device>();
 }
@@ -379,15 +415,61 @@ py::dict py_resolutions(const CuImage& cuimg)
 
 py::object py_read_region(const CuImage& cuimg,
-                          std::vector<int64_t>&& location,
+                          const py::iterable& location,
                           std::vector<int64_t>&& size,
                           int16_t level,
+                          uint32_t num_workers,
+                          uint32_t batch_size,
+                          bool drop_last,
+                          uint32_t prefetch_factor,
+                          bool shuffle,
+                          uint64_t seed,
                           const io::Device& device,
                           const py::object& buf,
                           const std::string& shm_name,
                           const py::kwargs& kwargs)
 {
+    if (!size.empty() && size.size() != 2)
+    {
+        throw std::runtime_error("size (patch size) should have two elements!");
+    }
+
     cucim::DimIndices indices;
+    std::vector<int64_t> locations;
+    {
+        py::gil_scoped_acquire scope_guard;
+
+        auto arr = pybind11::array_t<int64_t>::ensure(location);
+        if (arr) // fast copy
+        {
+            py::buffer_info buf = arr.request();
+            int64_t* data_array = static_cast<int64_t*>(buf.ptr);
+            ssize_t data_size = buf.size;
+            locations.reserve(data_size);
+            locations.insert(locations.end(), &data_array[0], &data_array[data_size]);
+        }
+        else
+        {
+            auto iter = py::iter(location);
+            while (iter != py::iterator::sentinel())
+            {
+                if (py::isinstance<py::iterable>(*iter))
+                {
+                    auto iter2 = py::iter(*iter);
+                    while (iter2 != py::iterator::sentinel())
+                    {
+                        locations.emplace_back(py::cast<int64_t>(*iter2));
+                        ++iter2;
+                    }
+                }
+                else
+                {
+                    locations.emplace_back(py::cast<int64_t>(*iter));
+                }
+                ++iter;
+            }
+        }
+    }
 
     if (kwargs)
     {
@@ -414,9 +496,6 @@ py::object py_read_region(const CuImage& cuimg,
             }
 
             indices_args.emplace_back(std::make_pair(key_char, value));
-
-            // fmt::print("k:{} v:{}\n", std::string(py::str(item.first)),
-            //            std::string(py::str(item.second)));
         }
 
         indices = cucim::DimIndices(indices_args);
@@ -427,14 +506,26 @@ py::object py_read_region(const CuImage& cuimg,
     }
 
     auto region_ptr = std::make_shared<CuImage>(
-        cuimg.read_region(std::move(location), std::move(size), level, indices, device, nullptr, ""));
+        std::move(cuimg.read_region(std::move(locations), std::move(size), level, num_workers, batch_size, drop_last,
+                                    prefetch_factor, shuffle, seed, indices, device, nullptr, "")));
+
+    auto loader = region_ptr->loader();
+    if (batch_size > 1 || (loader && loader->size() > 1))
+    {
+        auto iter_ptr = region_ptr->begin();
+
+        py::gil_scoped_acquire scope_guard;
+
+        py::object iter = py::cast(iter_ptr);
+        return iter;
+    }
+    else
     {
         py::gil_scoped_acquire scope_guard;
 
         py::object region = py::cast(region_ptr);
 
-        // Add `__array_interace__` or `__cuda_array_interface__` in runtime.
+        // Add `__array_interface__` or `__cuda_array_interface__` at runtime.
         _set_array_interface(region);
 
         return region;
     }
 }
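
The `location` parsing above is permissive about input layout. A short sketch of the
accepted forms, with arbitrary coordinate values:

    import numpy as np

    # Equivalent ways to request two patches at (0, 0) and (100, 200):
    locs_flat   = [0, 0, 100, 200]                         # flat coordinate sequence
    locs_nested = [(0, 0), (100, 200)]                     # sequence of coordinate pairs
    locs_array  = np.asarray(locs_nested, dtype=np.int64)  # buffer-protocol fast path

An int64 NumPy array is copied in bulk through the `array_t` fast path; any other
iterable is walked element by element, descending one level into nested iterables.
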
@@ -450,13 +541,40 @@ py::object py_associated_image(const CuImage& cuimg, const std::string& name, const io::Device& device)
 
         py::object image = py::cast(image_ptr);
 
-        // Add `__array_interace__` or `__cuda_array_interface__` in runtime.
+        // Add `__array_interface__` or `__cuda_array_interface__` at runtime.
         _set_array_interface(image);
 
         return image;
     }
 }
 
+py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it)
+{
+    bool stop_iteration = (it.index() == it.size());
+
+    // Get the next batch of images.
+    ++it;
+
+    auto cuimg = *it;
+    memory::DLTContainer container = cuimg->container();
+    DLTensor* tensor = static_cast<DLTensor*>(container);
+    cucim::loader::ThreadBatchDataLoader* loader = cuimg->loader();
+
+    {
+        py::gil_scoped_acquire scope_guard;
+        py::object cuimg_obj = py::cast(cuimg);
+        if (loader)
+        {
+            _set_array_interface(cuimg_obj);
+        }
+        if (stop_iteration)
+        {
+            throw py::stop_iteration();
+        }
+        return cuimg_obj;
+    }
+}
+
 void _set_array_interface(const py::object& cuimg_obj)
 {
     const auto& cuimg = cuimg_obj.cast<const CuImage&>();
 
     // TODO: using __array_struct__, access to array interface could be faster
     //       (https://numpy.org/doc/stable/reference/arrays.interface.html#c-struct-access)
     // TODO: check the performance difference between python int vs python long later.
+
+    loader::ThreadBatchDataLoader* loader = cuimg.loader();
     memory::DLTContainer container = cuimg.container();
 
-    const DLTensor* tensor = static_cast<const DLTensor*>(container);
+    DLTensor* tensor = static_cast<DLTensor*>(container);
     if (!tensor)
     {
         return;
     }
+    if (loader)
+    {
+        // Get the last available (batch) image.
+        tensor->data = loader->data();
+    }
 
-    const char* type_str = container.numpy_dtype();
-    py::str typestr = py::str(type_str);
+    if (tensor->data)
+    {
+        const char* type_str = container.numpy_dtype();
+        py::str typestr = py::str(type_str);
 
-    py::tuple data = pybind11::make_tuple(py::int_(reinterpret_cast<uint64_t>(tensor->data)), py::bool_(false));
-    py::list descr;
-    descr.append(py::make_tuple(""_s, typestr));
+        py::tuple data = pybind11::make_tuple(py::int_(reinterpret_cast<uint64_t>(tensor->data)), py::bool_(false));
+        py::list descr;
+        descr.append(py::make_tuple(""_s, typestr));
 
-    py::tuple shape = vector2pytuple<pybind11::int_>(cuimg.shape());
+        py::tuple shape = vector2pytuple<pybind11::int_>(cuimg.shape());
 
-    // TODO: depending on container's memory type, expose either array_interface or cuda_array_interface
-    switch (tensor->ctx.device_type)
-    {
-    case kDLCPU: {
-        // Reference: https://numpy.org/doc/stable/reference/arrays.interface.html
-        cuimg_obj.attr("__array_interface__") =
-            py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr,
-                      "typestr"_a = typestr, "shape"_a = shape, "version"_a = py::int_(3) };
-    }
-    break;
-    case kDLGPU: {
-        // Reference: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
-        cuimg_obj.attr("__cuda_array_interface__") =
-            py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr, "typestr"_a = typestr,
-                      "shape"_a = shape, "version"_a = py::int_(3), "mask"_a = py::none(), "stream"_a = 1 };
-    }
-    break;
-    default:
-        break;
-    }
+        // Depending on the container's memory type, expose either __array_interface__ or __cuda_array_interface__.
+        switch (tensor->ctx.device_type)
+        {
+        case kDLCPU: {
+            // Reference: https://numpy.org/doc/stable/reference/arrays.interface.html
+            cuimg_obj.attr("__array_interface__") =
+                py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr,
+                          "typestr"_a = typestr, "shape"_a = shape, "version"_a = py::int_(3) };
+        }
+        break;
+        case kDLGPU: {
+            // Reference: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+            cuimg_obj.attr("__cuda_array_interface__") =
+                py::dict{ "data"_a = data, "strides"_a = py::none(), "descr"_a = descr, "typestr"_a = typestr,
+                          "shape"_a = shape, "version"_a = py::int_(3), "mask"_a = py::none(), "stream"_a = 1 };
+        }
+        break;
+        default:
+            break;
+        }
+    }
+    else
+    {
+        switch (tensor->ctx.device_type)
+        {
+        case kDLCPU: {
+            if (py::hasattr(cuimg_obj, "__array_interface__"))
+            {
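+                // No current buffer is available; drop the stale attribute so Python
+                // code cannot dereference a dangling pointer.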
+                py::delattr(cuimg_obj, "__array_interface__");
+            }
+        }
+        break;
+        case kDLGPU: {
+            if (py::hasattr(cuimg_obj, "__cuda_array_interface__"))
+            {
+                py::delattr(cuimg_obj, "__cuda_array_interface__");
+            }
+        }
+        break;
+        default:
+            break;
+        }
+    }
 }
diff --git a/python/pybind11/cucim_py.h b/python/pybind11/cucim_py.h
index 2b0614d65..4d25cda7a 100644
--- a/python/pybind11/cucim_py.h
+++ b/python/pybind11/cucim_py.h
@@ -29,6 +29,8 @@ namespace cucim
 
 // Forward declarations
 class CuImage;
+template <typename DataType>
+class CuImageIterator;
 
 namespace io
 {
 class Device;
@@ -67,15 +69,23 @@ bool py_is_trace_enabled(py::object /* self */);
 json py_metadata(const CuImage& cuimg);
 py::dict py_resolutions(const CuImage& cuimg);
 py::object py_read_region(const CuImage& cuimg,
-                          std::vector<int64_t>&& location,
+                          const py::iterable& location,
                           std::vector<int64_t>&& size,
                           int16_t level,
+                          uint32_t num_workers,
+                          uint32_t batch_size,
+                          bool drop_last,
+                          uint32_t prefetch_factor,
+                          bool shuffle,
+                          uint64_t seed,
                           const io::Device& device,
                           const py::object& buf,
                           const std::string& shm_name,
                           const py::kwargs& kwargs);
 py::object py_associated_image(const CuImage& cuimg, const std::string& name, const io::Device& device);
 
+py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it);
+
 void _set_array_interface(const py::object& cuimg_obj);
 
 } // namespace cucim
diff --git a/python/pybind11/cucim_pydoc.h b/python/pybind11/cucim_pydoc.h
index 255cd8998..673a0d900 100644
--- a/python/pybind11/cucim_pydoc.h
+++ b/python/pybind11/cucim_pydoc.h
@@ -258,6 +258,16 @@ Add `__array_interface__` or `__cuda_array_interface__` depending on the memory type.
 
 }; // namespace CuImage
 
+namespace CuImageIterator
+{
+
+// CuImageIterator(std::shared_ptr<DataType> cuimg, bool ending = false);
+PYDOC(CuImageIterator, R"doc(
+Constructor of CuImageIterator.
+)doc")
+
+} // namespace CuImageIterator
+
 } // namespace cucim::doc
 
 #endif // PYCUCIM_CUCIM_PYDOC_H
diff --git a/run b/run
index e2cd109a3..8033eaf18 100755
--- a/run
+++ b/run
@@ -268,6 +268,8 @@ build_local_libcucim_() {
 
     # Copy cufile SDK from host system to temp/cuda
     copy_gds_files_ $source_folder
+    # Copy the nvJPEG SDK from the host system to temp/cuda
+    copy_nvjpeg_files_ $source_folder
 
     ${CMAKE_CMD} -S ${source_folder} -B ${build_folder} -G "Unix Makefiles" \
         -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \
@@ -471,6 +473,74 @@ copy_gds_files_() {
     fi
 }
 
+get_arch_name_() {
+    architecture="unknown"
+    case $(uname -m) in
+        x86_64) architecture="x86_64" ;;
+        arm|aarch64) lscpu | awk '/Architecture:/{print $2}' | grep -i -q "aarch64" && architecture="sbsa";;
+    esac
+    echo "${architecture}"
+}
+
+copy_nvjpeg_files_() {
+    local root_folder=${1:-${TOP}}
+    local arch_name="$(get_arch_name_)"
+    local nvjpeg_search="${root_folder}/temp/cuda/include:${root_folder}/temp/cuda/lib64"
+    local cuda_version="11.6"
+    local nvjpeg_version="11-6_11.6.0.55-1"
+    local candidate
+    local nvjpeg_include
+    local nvjpeg_lib
+
+    for candidate in ${nvjpeg_search}; do
+        nvjpeg_include="$(echo $candidate | cut -d: -f1)"
+        nvjpeg_lib="$(echo $candidate | cut -d: -f2)"
+        if [ -f ${nvjpeg_include}/nvjpeg.h ] && [ -f ${nvjpeg_lib}/libnvjpeg_static.a ]; then
+            c_echo W "nvJPEG client library is available at '${nvjpeg_include}/nvjpeg.h' and '${nvjpeg_lib}/libnvjpeg_static.a'"
+            break
+        fi
+        nvjpeg_include=""
+        nvjpeg_lib=""
+    done
+
+    if [ ! -f ${nvjpeg_include}/nvjpeg.h ]; then
+        c_echo_err Y "nvJPEG library is not available! Downloading the redistributable package to get nvjpeg.h and libraries."
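+
+        # Remove any stale nvJPEG headers/libraries before unpacking the fresh package.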
+        run_command rm -rf ${root_folder}/temp/cuda/include/*nvjpeg*
+        run_command rm -rf ${root_folder}/temp/cuda/lib64/*nvjpeg*
+        run_command mkdir -p ${root_folder}/temp/cuda/include ${root_folder}/temp/cuda/lib64
+
+        local temp_tgz_dir=$(mktemp -d)
+        pushd ${temp_tgz_dir}
+        c_echo W "Arch name: " G "${arch_name}"
+        if [ "${arch_name}" = "sbsa" ]; then
+            run_command wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/${arch_name}/libnvjpeg-dev-${nvjpeg_version}_arm64.deb
+        else
+            run_command wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/${arch_name}/libnvjpeg-dev-${nvjpeg_version}_amd64.deb
+        fi
+
+        mkdir -p libnvjpeg-dev
+        mv libnvjpeg-dev-${nvjpeg_version}*.deb libnvjpeg-dev/
+        pushd libnvjpeg-dev > /dev/null
+        run_command ar x libnvjpeg-dev-${nvjpeg_version}*.deb
+        run_command xz --decompress data.tar.xz
+        run_command tar xvf data.tar
+        popd > /dev/null
+
+        run_command cp libnvjpeg-dev/usr/local/cuda-${cuda_version}/include/nvjpeg.h ${root_folder}/temp/cuda/include/
+        run_command cp libnvjpeg-dev/usr/local/cuda-${cuda_version}/lib64/libnvjpeg_static.a ${root_folder}/temp/cuda/lib64/
+        popd > /dev/null
+        run_command rm -r ${temp_tgz_dir}
+    else
+        run_command mkdir -p ${root_folder}/temp/cuda/include ${root_folder}/temp/cuda/lib64
+
+        if [ "${nvjpeg_include}" != "${root_folder}/temp/cuda/include" ]; then
+            run_command cp -Pf ${nvjpeg_include}/*nvjpeg* ${root_folder}/temp/cuda/include/ || true
+            run_command cp -Pf ${nvjpeg_lib}/*nvjpeg* ${root_folder}/temp/cuda/lib64/ || true
+        fi
+    fi
+}
+
 build_python_package_desc() { echo 'Build Python package
 Note: This command does not remove `dist` folder before building.
@@ -483,6 +553,8 @@ build_python_package() {
 
     # Copy cufile SDK from host system to temp/cuda
     copy_gds_files_
+    # Copy the nvJPEG SDK from the host system to temp/cuda
+    copy_nvjpeg_files_
 
     run_command ${TOP}/dockcross-manylinux2014-x64 ./run build_python_package_
     ret=$?
diff --git a/scripts/debug_python b/scripts/debug_python
new file mode 100755
index 000000000..f62e377fe
--- /dev/null
+++ b/scripts/debug_python
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+if [ -e ${SCRIPT_DIR}/debug_env.sh ]; then
+    # Users can place a custom debug_env.sh next to this script
+    # (scripts/debug_env.sh is ignored by git; see .gitignore).
+    . ${SCRIPT_DIR}/debug_env.sh
+elif [ -e ${SCRIPT_DIR}/../.python-version ]; then
+    # pyenv needs to be initialized inside the script:
+    # https://github.com/pyenv/pyenv-virtualenv/issues/204
+    eval "$(pyenv init -)"
+    eval "$(pyenv virtualenv-init -)"
+    # The above effectively runs `pyenv activate $(cat ${SCRIPT_DIR}/../.python-version)`.
+else
+    echo "Environment file not found. Exiting."
+    exit 1
+fi
+
+echo "Python: $(python3 -c "import sys;print(sys.executable)")"
+exec env python3 "$@"
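
Usage note: the "(gdb) cucim_py" launch configuration in cucim.code-workspace runs this
script through bash (for example, `scripts/debug_python python/cucim/src/localtest.py`),
so the debugger attaches to the Python interpreter selected by debug_env.sh or pyenv.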