diff --git a/libs/Common/UtilCUDA.cpp b/libs/Common/UtilCUDA.cpp
index dddc6d5a4..5b79679dc 100644
--- a/libs/Common/UtilCUDA.cpp
+++ b/libs/Common/UtilCUDA.cpp
@@ -123,10 +123,10 @@ CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
 		if (device.major > 0 && device.major < 9999) {
 			best_SM_arch = MAXF(best_SM_arch, device.major);
 			device.ID = (CUdevice)current_device;
-			devices.Insert(device);
+			devices.emplace_back(std::move(device));
 		}
 	}
-	if (devices.IsEmpty()) {
+	if (devices.empty()) {
 		VERBOSE("CUDA error: all devices have compute mode prohibited");
 		return CUDA_ERROR_PROFILER_DISABLED;
 	}
@@ -134,22 +134,22 @@ CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
 	// Find the best CUDA capable GPU device
 	Device* max_perf_device = NULL;
 	size_t max_compute_perf = 0;
-	FOREACHPTR(pDevice, devices) {
-		ASSERT(pDevice->computeMode != CU_COMPUTEMODE_PROHIBITED);
-		int sm_per_multiproc = _convertSMVer2Cores(pDevice->major, pDevice->minor);
+	for (Device& device: devices) {
+		ASSERT(device.computeMode != CU_COMPUTEMODE_PROHIBITED);
+		int sm_per_multiproc = _convertSMVer2Cores(device.major, device.minor);
 		int multiProcessorCount;
-		if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (CUdevice)pDevice->ID)) != CUDA_SUCCESS)
+		if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device.ID)) != CUDA_SUCCESS)
 			continue;
 		int clockRate;
-		if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, pDevice->ID)) != CUDA_SUCCESS)
+		if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device.ID)) != CUDA_SUCCESS)
 			continue;
 		size_t compute_perf = (size_t)multiProcessorCount * sm_per_multiproc * clockRate;
 		if (compute_perf > max_compute_perf &&
 			(best_SM_arch < 3 || // if we find GPU with SM major > 2, search only these
-			pDevice->major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
+			device.major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
 		{
 			max_compute_perf = compute_perf;
-			max_perf_device = pDevice;
+			max_perf_device = &device;
 		}
 	}
 	if (max_perf_device == NULL)
@@ -180,8 +180,8 @@ CUresult initDevice(int deviceID)
 		VERBOSE("CUDA error: compute capability 3.2 or greater required (available %d.%d for device[%d])", device.ID, device.major, device.minor);
 		return CUDA_ERROR_INVALID_DEVICE;
 	}
-	devices.Insert(device);
-	checkCudaError(cuCtxCreate(&devices.Last().ctx, CU_CTX_SCHED_AUTO, device.ID));
+	devices.emplace_back(device);
+	checkCudaError(cuCtxCreate(&devices.back().ctx, CU_CTX_SCHED_AUTO, device.ID));
 
 	#if TD_VERBOSE != TD_VERBOSE_OFF
 	char name[2048];
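Reviewer note: the cLib container idioms replaced above (Insert/IsEmpty/FOREACHPTR/Last) map onto std::vector one-to-one. A minimal compilable sketch of the same selection pattern, using a stripped-down stand-in Device struct rather than the real SEACAVE::CUDA::Device:

#include <cstddef>
#include <vector>

struct Device {          // stand-in, not the real SEACAVE::CUDA::Device
	int ID;
	int major, minor;
	size_t computePerf;  // multiProcessorCount * sm_per_multiproc * clockRate
};

// pick the device with the highest compute performance
const Device* FindBestDevice(const std::vector<Device>& devices) {
	if (devices.empty())                   // was: devices.IsEmpty()
		return nullptr;
	const Device* best = nullptr;
	size_t maxPerf = 0;
	for (const Device& device: devices) {  // range-for replaces FOREACHPTR
		if (device.computePerf > maxPerf) {
			maxPerf = device.computePerf;
			best = &device;                // was: max_perf_device = pDevice
		}
	}
	return best;
}
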
diff --git a/libs/Common/UtilCUDA.h b/libs/Common/UtilCUDA.h
index 44d869d6b..d04b04717 100644
--- a/libs/Common/UtilCUDA.h
+++ b/libs/Common/UtilCUDA.h
@@ -23,6 +23,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include "UtilCUDADevice.h"
+
 
 // D E F I N E S ///////////////////////////////////////////////////
 
@@ -66,7 +68,6 @@ inline CUresult __reportCudaError(CUresult result, LPCSTR errorMessage) {
 	return result;
 }
 #define reportCudaError(val) CUDA::__reportCudaError(val, #val)
-
 #define checkCudaError(val) { const CUresult ret(CUDA::__reportCudaError(val, #val)); if (ret != CUDA_SUCCESS) return ret; }
 
 // outputs the proper CUDA error code and abort in the event that a CUDA host call returns an error
@@ -77,18 +78,7 @@ inline void __ensureCudaResult(CUresult result, LPCSTR errorMessage) {
 	exit(EXIT_FAILURE);
 }
 #define ensureCudaResult(val) CUDA::__ensureCudaResult(val, #val)
-
-inline void checkCudaCall(const cudaError_t error) {
-	if (error == cudaSuccess)
-		return;
-	#ifdef _DEBUG
-	VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
-	#else
-	DEBUG("CUDA error: %s (code %d)", cudaGetErrorString(error), error);
-	#endif
-	ASSERT("CudaError" == NULL);
-	exit(EXIT_FAILURE);
-}
+/*----------------------------------------------------------------*/
 
 // rounds up addr to the align boundary
 template
@@ -478,7 +468,7 @@ class TArrayRT
 public:
 	inline TArrayRT() : hArray(NULL) {}
-	inline TArrayRT(const Image8U::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
+	inline TArrayRT(const cv::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
 	inline TArrayRT(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(width, height, depth, flags)); }
 	inline ~TArrayRT() { Release(); }
@@ -498,7 +488,7 @@ class TArrayRT
 			hArray = NULL;
 		}
 	}
-	inline CUresult Reset(const Image8U::Size& size, unsigned flags=0) {
+	inline CUresult Reset(const cv::Size& size, unsigned flags=0) {
 		return Reset((unsigned)size.width, (unsigned)size.height, 0, flags);
 	}
 	CUresult Reset(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) {
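Hypothetical call site for the TArrayRT signature change above: Reset() and the constructor now take a plain cv::Size, so a cv::Mat size can be forwarded directly instead of going through Image8U::Size. The element-type template argument (float) and the include path are assumptions:

#include <opencv2/core.hpp>
#include "UtilCUDA.h" // assumed include path

void UploadDepthMap(const cv::Mat1f& depthMap) {
	// before: the caller needed an Image8U::Size (SEACAVE image type);
	// after: any cv::Size, e.g. cv::Mat::size(), is accepted directly
	SEACAVE::CUDA::TArrayRT<float> arr(depthMap.size());
}
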
diff --git a/libs/Common/UtilCUDADevice.h b/libs/Common/UtilCUDADevice.h
new file mode 100644
index 000000000..46933650d
--- /dev/null
+++ b/libs/Common/UtilCUDADevice.h
@@ -0,0 +1,85 @@
+////////////////////////////////////////////////////////////////////
+// UtilCUDADevice.h
+//
+// Copyright 2024 cDc@seacave
+// Distributed under the Boost Software License, Version 1.0
+// (See http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef __SEACAVE_CUDA_DEVICE_H__
+#define __SEACAVE_CUDA_DEVICE_H__
+
+
+// I N C L U D E S /////////////////////////////////////////////////
+
+#include "Config.h"
+
+// CUDA driver
+#include <cuda.h>
+
+// CUDA toolkit
+#include <cuda_runtime.h>
+
+#include <memory>
+
+
+// D E F I N E S ///////////////////////////////////////////////////
+
+#ifndef VERBOSE
+#define DEFINE_VERBOSE 1
+#define VERBOSE(...) fprintf(stderr, __VA_ARGS__)
+#endif
+
+// check for CUDA errors following a CUDA call
+#define CUDA_CHECK(condition) SEACAVE::CUDA::checkCudaCall(condition)
+
+// check cudaGetLastError() for success
+#define CUDA_CHECK_LAST_ERROR CUDA_CHECK(cudaGetLastError());
+
+
+// S T R U C T S ///////////////////////////////////////////////////
+
+namespace SEACAVE {
+
+namespace CUDA {
+
+inline void checkCudaCall(const cudaError_t error) {
+	if (error == cudaSuccess)
+		return;
+	VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
+	ASSERT("CudaError" == NULL);
+	exit(EXIT_FAILURE);
+}
+
+// define smart pointers for CUDA stream
+struct CudaStreamDestructor {
+	void operator()(cudaStream_t s) {
+		if (s)
+			CUDA_CHECK(cudaStreamDestroy(s));
+	}
+};
+
+typedef std::unique_ptr<std::remove_pointer<cudaStream_t>::type, CudaStreamDestructor> CudaStreamPtr;
+inline CudaStreamPtr CreateStream() {
+	cudaStream_t stream;
+	CUDA_CHECK(cudaStreamCreate(&stream));
+	return CudaStreamPtr(stream, CudaStreamDestructor());
+}
+
+typedef std::shared_ptr<std::remove_pointer<cudaStream_t>::type> CudaStreamSharedPtr;
+inline CudaStreamSharedPtr CreateSharedStream() {
+	cudaStream_t stream;
+	CUDA_CHECK(cudaStreamCreate(&stream));
+	return CudaStreamSharedPtr(stream, CudaStreamDestructor());
+}
+/*----------------------------------------------------------------*/
+
+} // namespace CUDA
+
+} // namespace SEACAVE
+
+#ifdef DEFINE_VERBOSE
+#undef DEFINE_VERBOSE
+#undef VERBOSE
+#endif
+
+#endif // __SEACAVE_CUDA_DEVICE_H__
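Usage sketch for the new header: CUDA_CHECK aborts on any CUDA runtime error, and CreateStream() returns an RAII handle that destroys the stream when it leaves scope. The function and its parameters are hypothetical; the macros and types are the ones defined above:

#include "UtilCUDADevice.h"

void CopyToDeviceAsync(float* devDst, const float* hostSrc, size_t count) {
	using namespace SEACAVE::CUDA;
	CudaStreamPtr stream = CreateStream(); // owns the cudaStream_t
	CUDA_CHECK(cudaMemcpyAsync(devDst, hostSrc, count * sizeof(float),
		cudaMemcpyHostToDevice, stream.get()));
	CUDA_CHECK(cudaStreamSynchronize(stream.get()));
} // CudaStreamDestructor runs here and calls cudaStreamDestroy
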
diff --git a/libs/MVS/PatchMatchCUDA.cpp b/libs/MVS/PatchMatchCUDA.cpp
index 814e73e9c..ea12344de 100644
--- a/libs/MVS/PatchMatchCUDA.cpp
+++ b/libs/MVS/PatchMatchCUDA.cpp
@@ -108,18 +108,18 @@ void PatchMatchCUDA::Init(bool bGeomConsistency)
 void PatchMatchCUDA::AllocatePatchMatchCUDA(const cv::Mat1f& image)
 {
 	const size_t num_images = images.size();
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
+	CUDA_CHECK(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
+	CUDA_CHECK(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
 	if (params.bGeomConsistency)
-		CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));
+		CUDA_CHECK(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));
 
 	const size_t size = image.size().area();
 	depthNormalEstimates = new Point4[size];
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));
 
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
 }
 
 void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bInitImage, bool bInitDepthMap)
@@ -127,7 +127,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 	const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 
 	if (bInitImage) {
-		CUDA::checkCudaCall(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));
+		CUDA_CHECK(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));
 
 		struct cudaResourceDesc resDesc;
 		memset(&resDesc, 0, sizeof(cudaResourceDesc));
@@ -142,7 +142,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 		texDesc.readMode = cudaReadModeElementType;
 		texDesc.normalizedCoords = 0;
 
-		CUDA::checkCudaCall(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
+		CUDA_CHECK(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
 	}
 
 	if (params.bGeomConsistency && i > 0) {
@@ -152,7 +152,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 			return;
 		}
 
-		CUDA::checkCudaCall(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));
+		CUDA_CHECK(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));
 
 		struct cudaResourceDesc resDesc;
 		memset(&resDesc, 0, sizeof(cudaResourceDesc));
@@ -167,7 +167,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 		texDesc.readMode = cudaReadModeElementType;
 		texDesc.normalizedCoords = 0;
 
-		CUDA::checkCudaCall(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
+		CUDA_CHECK(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
 	}
 }
 
@@ -213,7 +213,7 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			cv::resize(lowResDepthMap, depthData.depthMap, size, 0, 0, cv::INTER_LINEAR);
 			cv::resize(lowResNormalMap, depthData.normalMap, size, 0, 0, cv::INTER_NEAREST);
 			cv::resize(lowResViewsMap, depthData.viewsMap, size, 0, 0, cv::INTER_NEAREST);
-			CUDA::checkCudaCall(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
+			CUDA_CHECK(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
 		} else {
 			if (totalScaleNumber > 0) {
 				// smallest resolution, when multi-resolution is enabled
@@ -289,13 +289,13 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			}
 			AllocateImageCUDA(i, image, false, !view.depthMap.empty());
 		}
-		CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
 		if (params.bGeomConsistency && i > 0 && !view.depthMap.empty()) {
 			// set previously computed depth-map
 			DepthMap depthMap(view.depthMap);
 			if (depthMap.size() != image.size())
 				cv::resize(depthMap, depthMap, image.size(), 0, 0, cv::INTER_LINEAR);
-			CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
+			CUDA_CHECK(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
 		}
 		images[i] = std::move(image);
 		cameras[i] = std::move(camera);
@@ -323,12 +323,12 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 	prevNumImages = numImages;
 
 	// setup CUDA memory
-	CUDA::checkCudaCall(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
-	CUDA::checkCudaCall(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
 	if (params.bGeomConsistency) {
 		// set previously computed depth-maps
 		ASSERT(depthData.depthMap.size() == depthData.GetView().image.size());
-		CUDA::checkCudaCall(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
 	}
 
 	// load depth-map and normal-map into CUDA memory
@@ -342,20 +342,20 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			depthNormal.w() = depthData.depthMap(r, c);
 		}
 	}
-	CUDA::checkCudaCall(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
 
 	// load low resolution depth-map into CUDA memory
 	if (params.bLowResProcessed) {
 		ASSERT(depthData.depthMap.isContinuous());
-		CUDA::checkCudaCall(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
 	}
 
 	// run CUDA patch-match
 	ASSERT(!depthData.viewsMap.empty());
 	RunCUDA(depthData.confMap.getData(), (uint32_t*)depthData.viewsMap.getData());
-	CUDA::checkCudaCall(cudaGetLastError());
+	CUDA_CHECK(cudaGetLastError());
 	if (params.bLowResProcessed)
-		CUDA::checkCudaCall(cudaFree(cudaLowDepths));
+		CUDA_CHECK(cudaFree(cudaLowDepths));
 
 	// load depth-map, normal-map and confidence-map from CUDA memory
 	for (int r = 0; r < depthData.depthMap.rows; ++r) {
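Condensed sketch of the allocate/upload/bind pattern these PatchMatchCUDA.cpp hunks switch over to CUDA_CHECK; the helper name is hypothetical, and the address/filter modes are assumptions since those context lines are outside the diff:

#include <cstring>
#include "UtilCUDADevice.h" // assumed include path, provides CUDA_CHECK

// upload one single-channel float image into a CUDA array and bind a texture
cudaTextureObject_t UploadImage(const float* data, size_t pitchBytes,
                                int cols, int rows, cudaArray_t& array) {
	const cudaChannelFormatDesc channelDesc =
		cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	CUDA_CHECK(cudaMallocArray(&array, &channelDesc, cols, rows));
	CUDA_CHECK(cudaMemcpy2DToArray(array, 0, 0, data, pitchBytes,
		cols * sizeof(float), rows, cudaMemcpyHostToDevice));
	cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(cudaResourceDesc));
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = array;
	cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(cudaTextureDesc));
	texDesc.addressMode[0] = cudaAddressModeClamp; // assumed
	texDesc.addressMode[1] = cudaAddressModeClamp; // assumed
	texDesc.filterMode = cudaFilterModePoint;      // assumed
	texDesc.readMode = cudaReadModeElementType;    // as in the diff
	texDesc.normalizedCoords = 0;                  // as in the diff
	cudaTextureObject_t texture;
	CUDA_CHECK(cudaCreateTextureObject(&texture, &resDesc, &texDesc, NULL));
	return texture;
}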