diff --git a/libs/Common/UtilCUDA.cpp b/libs/Common/UtilCUDA.cpp
index dddc6d5a4..5b79679dc 100644
--- a/libs/Common/UtilCUDA.cpp
+++ b/libs/Common/UtilCUDA.cpp
@@ -123,10 +123,10 @@ CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
 		if (device.major > 0 && device.major < 9999) {
 			best_SM_arch = MAXF(best_SM_arch, device.major);
 			device.ID = (CUdevice)current_device;
-			devices.Insert(device);
+			devices.emplace_back(std::move(device));
 		}
 	}
-	if (devices.IsEmpty()) {
+	if (devices.empty()) {
 		VERBOSE("CUDA error: all devices have compute mode prohibited");
 		return CUDA_ERROR_PROFILER_DISABLED;
 	}
@@ -134,22 +134,22 @@ CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
 	// Find the best CUDA capable GPU device
 	Device* max_perf_device = NULL;
 	size_t max_compute_perf = 0;
-	FOREACHPTR(pDevice, devices) {
-		ASSERT(pDevice->computeMode != CU_COMPUTEMODE_PROHIBITED);
-		int sm_per_multiproc = _convertSMVer2Cores(pDevice->major, pDevice->minor);
+	for (Device& device: devices) {
+		ASSERT(device.computeMode != CU_COMPUTEMODE_PROHIBITED);
+		int sm_per_multiproc = _convertSMVer2Cores(device.major, device.minor);
 		int multiProcessorCount;
-		if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (CUdevice)pDevice->ID)) != CUDA_SUCCESS)
+		if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device.ID)) != CUDA_SUCCESS)
 			continue;
 		int clockRate;
-		if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, pDevice->ID)) != CUDA_SUCCESS)
+		if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device.ID)) != CUDA_SUCCESS)
 			continue;
 		size_t compute_perf = (size_t)multiProcessorCount * sm_per_multiproc * clockRate;
 		if (compute_perf > max_compute_perf &&
 			(best_SM_arch < 3 || // if we find GPU with SM major > 2, search only these
-			pDevice->major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
+			device.major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
 		{
 			max_compute_perf = compute_perf;
-			max_perf_device = pDevice;
+			max_perf_device = &device;
 		}
 	}
 	if (max_perf_device == NULL)
@@ -180,8 +180,8 @@ CUresult initDevice(int deviceID)
 		VERBOSE("CUDA error: compute capability 3.2 or greater required (available %d.%d for device[%d])", device.ID, device.major, device.minor);
 		return CUDA_ERROR_INVALID_DEVICE;
 	}
-	devices.Insert(device);
-	checkCudaError(cuCtxCreate(&devices.Last().ctx, CU_CTX_SCHED_AUTO, device.ID));
+	devices.emplace_back(device);
+	checkCudaError(cuCtxCreate(&devices.back().ctx, CU_CTX_SCHED_AUTO, device.ID));
 
 	#if TD_VERBOSE != TD_VERBOSE_OFF
 	char name[2048];
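Reviewer note: the cLib container idioms replaced above (Insert/IsEmpty/FOREACHPTR/Last) map onto std::vector one-to-one. A minimal compilable sketch of the same selection pattern, using a stripped-down stand-in Device struct rather than the real SEACAVE::CUDA::Device:

#include <cstddef>
#include <vector>

struct Device {          // stand-in, not the real SEACAVE::CUDA::Device
	int ID;
	int major, minor;
	size_t computePerf;  // multiProcessorCount * sm_per_multiproc * clockRate
};

// pick the device with the highest compute performance
const Device* FindBestDevice(const std::vector<Device>& devices) {
	if (devices.empty())                   // was: devices.IsEmpty()
		return nullptr;
	const Device* best = nullptr;
	size_t maxPerf = 0;
	for (const Device& device: devices) {  // range-for replaces FOREACHPTR
		if (device.computePerf > maxPerf) {
			maxPerf = device.computePerf;
			best = &device;                // was: max_perf_device = pDevice
		}
	}
	return best;
}
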
diff --git a/libs/Common/UtilCUDA.h b/libs/Common/UtilCUDA.h
index 44d869d6b..d04b04717 100644
--- a/libs/Common/UtilCUDA.h
+++ b/libs/Common/UtilCUDA.h
@@ -23,6 +23,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include "UtilCUDADevice.h"
+
 
 // D E F I N E S ///////////////////////////////////////////////////
 
@@ -66,7 +68,6 @@ inline CUresult __reportCudaError(CUresult result, LPCSTR errorMessage) {
 	return result;
 }
 #define reportCudaError(val) CUDA::__reportCudaError(val, #val)
-
 #define checkCudaError(val) { const CUresult ret(CUDA::__reportCudaError(val, #val)); if (ret != CUDA_SUCCESS) return ret; }
 
 // outputs the proper CUDA error code and abort in the event that a CUDA host call returns an error
@@ -77,18 +78,7 @@ inline void __ensureCudaResult(CUresult result, LPCSTR errorMessage) {
 	exit(EXIT_FAILURE);
 }
 #define ensureCudaResult(val) CUDA::__ensureCudaResult(val, #val)
-
-inline void checkCudaCall(const cudaError_t error) {
-	if (error == cudaSuccess)
-		return;
-	#ifdef _DEBUG
-	VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
-	#else
-	DEBUG("CUDA error: %s (code %d)", cudaGetErrorString(error), error);
-	#endif
-	ASSERT("CudaError" == NULL);
-	exit(EXIT_FAILURE);
-}
+/*----------------------------------------------------------------*/
 
 // rounds up addr to the align boundary
 template
@@ -478,7 +468,7 @@ class TArrayRT
 public:
 	inline TArrayRT() : hArray(NULL) {}
-	inline TArrayRT(const Image8U::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
+	inline TArrayRT(const cv::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
 	inline TArrayRT(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(width, height, depth, flags)); }
 	inline ~TArrayRT() { Release(); }
@@ -498,7 +488,7 @@ class TArrayRT
 			hArray = NULL;
 		}
 	}
-	inline CUresult Reset(const Image8U::Size& size, unsigned flags=0) {
+	inline CUresult Reset(const cv::Size& size, unsigned flags=0) {
 		return Reset((unsigned)size.width, (unsigned)size.height, 0, flags);
 	}
 	CUresult Reset(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) {
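Hypothetical call site for the TArrayRT signature change above: Reset() and the constructor now take a plain cv::Size, so a cv::Mat size can be forwarded directly instead of going through Image8U::Size. The element-type template argument (float) and the include path are assumptions:

#include <opencv2/core.hpp>
#include "UtilCUDA.h" // assumed include path

void UploadDepthMap(const cv::Mat1f& depthMap) {
	// before: the caller needed an Image8U::Size (SEACAVE image type);
	// after: any cv::Size, e.g. cv::Mat::size(), is accepted directly
	SEACAVE::CUDA::TArrayRT<float> arr(depthMap.size());
}
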
diff --git a/libs/Common/UtilCUDADevice.h b/libs/Common/UtilCUDADevice.h
new file mode 100644
index 000000000..46933650d
--- /dev/null
+++ b/libs/Common/UtilCUDADevice.h
@@ -0,0 +1,85 @@
+////////////////////////////////////////////////////////////////////
+// UtilCUDADevice.h
+//
+// Copyright 2024 cDc@seacave
+// Distributed under the Boost Software License, Version 1.0
+// (See http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef __SEACAVE_CUDA_DEVICE_H__
+#define __SEACAVE_CUDA_DEVICE_H__
+
+
+// I N C L U D E S /////////////////////////////////////////////////
+
+#include "Config.h"
+
+// CUDA driver
+#include <cuda.h>
+
+// CUDA toolkit
+#include <cuda_runtime.h>
+
+#include <memory>
+
+
+// D E F I N E S ///////////////////////////////////////////////////
+
+#ifndef VERBOSE
+#define DEFINE_VERBOSE 1
+#define VERBOSE(...) fprintf(stderr, __VA_ARGS__)
+#endif
+
+// check for CUDA errors following a CUDA call
+#define CUDA_CHECK(condition) SEACAVE::CUDA::checkCudaCall(condition)
+
+// check cudaGetLastError() for success
+#define CUDA_CHECK_LAST_ERROR CUDA_CHECK(cudaGetLastError());
+
+
+// S T R U C T S ///////////////////////////////////////////////////
+
+namespace SEACAVE {
+
+namespace CUDA {
+
+inline void checkCudaCall(const cudaError_t error) {
+	if (error == cudaSuccess)
+		return;
+	VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
+	ASSERT("CudaError" == NULL);
+	exit(EXIT_FAILURE);
+}
+
+// define smart pointers for CUDA stream
+struct CudaStreamDestructor {
+	void operator()(cudaStream_t s) {
+		if (s)
+			CUDA_CHECK(cudaStreamDestroy(s));
+	}
+};
+
+typedef std::unique_ptr<std::remove_pointer<cudaStream_t>::type, CudaStreamDestructor> CudaStreamPtr;
+inline CudaStreamPtr CreateStream() {
+	cudaStream_t stream;
+	CUDA_CHECK(cudaStreamCreate(&stream));
+	return CudaStreamPtr(stream, CudaStreamDestructor());
+}
+
+typedef std::shared_ptr<std::remove_pointer<cudaStream_t>::type> CudaStreamSharedPtr;
+inline CudaStreamSharedPtr CreateSharedStream() {
+	cudaStream_t stream;
+	CUDA_CHECK(cudaStreamCreate(&stream));
+	return CudaStreamSharedPtr(stream, CudaStreamDestructor());
+}
+/*----------------------------------------------------------------*/
+
+} // namespace CUDA
+
+} // namespace SEACAVE
+
+#ifdef DEFINE_VERBOSE
+#undef DEFINE_VERBOSE
+#undef VERBOSE
+#endif
+
+#endif // __SEACAVE_CUDA_DEVICE_H__
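Usage sketch for the new header: CUDA_CHECK aborts on any CUDA runtime error, and CreateStream() returns an RAII handle that destroys the stream when it leaves scope. The function and its parameters are hypothetical; the macros and types are the ones defined above:

#include "UtilCUDADevice.h"

void CopyToDeviceAsync(float* devDst, const float* hostSrc, size_t count) {
	using namespace SEACAVE::CUDA;
	CudaStreamPtr stream = CreateStream(); // owns the cudaStream_t
	CUDA_CHECK(cudaMemcpyAsync(devDst, hostSrc, count * sizeof(float),
		cudaMemcpyHostToDevice, stream.get()));
	CUDA_CHECK(cudaStreamSynchronize(stream.get()));
} // CudaStreamDestructor runs here and calls cudaStreamDestroy
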
diff --git a/libs/MVS/PatchMatchCUDA.cpp b/libs/MVS/PatchMatchCUDA.cpp
index 814e73e9c..ea12344de 100644
--- a/libs/MVS/PatchMatchCUDA.cpp
+++ b/libs/MVS/PatchMatchCUDA.cpp
@@ -108,18 +108,18 @@ void PatchMatchCUDA::Init(bool bGeomConsistency)
 void PatchMatchCUDA::AllocatePatchMatchCUDA(const cv::Mat1f& image)
 {
 	const size_t num_images = images.size();
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
+	CUDA_CHECK(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
+	CUDA_CHECK(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
 	if (params.bGeomConsistency)
-		CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));
+		CUDA_CHECK(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));
 
 	const size_t size = image.size().area();
 	depthNormalEstimates = new Point4[size];
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));
 
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
-	CUDA::checkCudaCall(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
+	CUDA_CHECK(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
 }
 
 void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bInitImage, bool bInitDepthMap)
@@ -127,7 +127,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 	const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 
 	if (bInitImage) {
-		CUDA::checkCudaCall(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));
+		CUDA_CHECK(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));
 
 		struct cudaResourceDesc resDesc;
 		memset(&resDesc, 0, sizeof(cudaResourceDesc));
@@ -142,7 +142,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 		texDesc.readMode = cudaReadModeElementType;
 		texDesc.normalizedCoords = 0;
 
-		CUDA::checkCudaCall(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
+		CUDA_CHECK(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
 	}
 
 	if (params.bGeomConsistency && i > 0) {
@@ -152,7 +152,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 			return;
 		}
 
-		CUDA::checkCudaCall(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));
+		CUDA_CHECK(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));
 
 		struct cudaResourceDesc resDesc;
 		memset(&resDesc, 0, sizeof(cudaResourceDesc));
@@ -167,7 +167,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
 		texDesc.readMode = cudaReadModeElementType;
 		texDesc.normalizedCoords = 0;
 
-		CUDA::checkCudaCall(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
+		CUDA_CHECK(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
 	}
 }
 
@@ -213,7 +213,7 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			cv::resize(lowResDepthMap, depthData.depthMap, size, 0, 0, cv::INTER_LINEAR);
 			cv::resize(lowResNormalMap, depthData.normalMap, size, 0, 0, cv::INTER_NEAREST);
 			cv::resize(lowResViewsMap, depthData.viewsMap, size, 0, 0, cv::INTER_NEAREST);
-			CUDA::checkCudaCall(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
+			CUDA_CHECK(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
 		} else {
 			if (totalScaleNumber > 0) {
 				// smallest resolution, when multi-resolution is enabled
@@ -289,13 +289,13 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			}
 			AllocateImageCUDA(i, image, false, !view.depthMap.empty());
 		}
-		CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
 		if (params.bGeomConsistency && i > 0 && !view.depthMap.empty()) {
 			// set previously computed depth-map
 			DepthMap depthMap(view.depthMap);
 			if (depthMap.size() != image.size())
 				cv::resize(depthMap, depthMap, image.size(), 0, 0, cv::INTER_LINEAR);
-			CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
+			CUDA_CHECK(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
 		}
 		images[i] = std::move(image);
 		cameras[i] = std::move(camera);
@@ -323,12 +323,12 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 	prevNumImages = numImages;
 
 	// setup CUDA memory
-	CUDA::checkCudaCall(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
-	CUDA::checkCudaCall(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
 	if (params.bGeomConsistency) {
 		// set previously computed depth-maps
 		ASSERT(depthData.depthMap.size() == depthData.GetView().image.size());
-		CUDA::checkCudaCall(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
 	}
 
 	// load depth-map and normal-map into CUDA memory
@@ -342,20 +342,20 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
 			depthNormal.w() = depthData.depthMap(r, c);
 		}
 	}
-	CUDA::checkCudaCall(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
+	CUDA_CHECK(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
 
 	// load low resolution depth-map into CUDA memory
 	if (params.bLowResProcessed) {
 		ASSERT(depthData.depthMap.isContinuous());
-		CUDA::checkCudaCall(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
+		CUDA_CHECK(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
 	}
 
 	// run CUDA patch-match
 	ASSERT(!depthData.viewsMap.empty());
 	RunCUDA(depthData.confMap.getData(), (uint32_t*)depthData.viewsMap.getData());
-	CUDA::checkCudaCall(cudaGetLastError());
+	CUDA_CHECK(cudaGetLastError());
 	if (params.bLowResProcessed)
-		CUDA::checkCudaCall(cudaFree(cudaLowDepths));
+		CUDA_CHECK(cudaFree(cudaLowDepths));
 
 	// load depth-map, normal-map and confidence-map from CUDA memory
 	for (int r = 0; r < depthData.depthMap.rows; ++r) {
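Condensed sketch of the allocate/upload/bind pattern these PatchMatchCUDA.cpp hunks switch over to CUDA_CHECK; the helper name is hypothetical, and the address/filter modes are assumptions since those context lines are outside the diff:

#include <cstring>
#include "UtilCUDADevice.h" // assumed include path, provides CUDA_CHECK

// upload one single-channel float image into a CUDA array and bind a texture
cudaTextureObject_t UploadImage(const float* data, size_t pitchBytes,
                                int cols, int rows, cudaArray_t& array) {
	const cudaChannelFormatDesc channelDesc =
		cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	CUDA_CHECK(cudaMallocArray(&array, &channelDesc, cols, rows));
	CUDA_CHECK(cudaMemcpy2DToArray(array, 0, 0, data, pitchBytes,
		cols * sizeof(float), rows, cudaMemcpyHostToDevice));
	cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(cudaResourceDesc));
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = array;
	cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(cudaTextureDesc));
	texDesc.addressMode[0] = cudaAddressModeClamp; // assumed
	texDesc.addressMode[1] = cudaAddressModeClamp; // assumed
	texDesc.filterMode = cudaFilterModePoint;      // assumed
	texDesc.readMode = cudaReadModeElementType;    // as in the diff
	texDesc.normalizedCoords = 0;                  // as in the diff
	cudaTextureObject_t texture;
	CUDA_CHECK(cudaCreateTextureObject(&texture, &resDesc, &texDesc, NULL));
	return texture;
}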