Skip to content

Commit

Permalink
common: add CUDA utils for device
Browse files Browse the repository at this point in the history
(cherry picked from commit 8d097a76ea1f74017fdd23794742df09133f3c35)
  • Loading branch information
cdcseacave committed Aug 21, 2024
1 parent adaeecc commit a759b87
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 47 deletions.
22 changes: 11 additions & 11 deletions libs/Common/UtilCUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,33 +123,33 @@ CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
if (device.major > 0 && device.major < 9999) {
best_SM_arch = MAXF(best_SM_arch, device.major);
device.ID = (CUdevice)current_device;
devices.Insert(device);
devices.emplace_back(std::move(device));
}
}
if (devices.IsEmpty()) {
if (devices.empty()) {
VERBOSE("CUDA error: all devices have compute mode prohibited");
return CUDA_ERROR_PROFILER_DISABLED;
}

// Find the best CUDA capable GPU device
Device* max_perf_device = NULL;
size_t max_compute_perf = 0;
FOREACHPTR(pDevice, devices) {
ASSERT(pDevice->computeMode != CU_COMPUTEMODE_PROHIBITED);
int sm_per_multiproc = _convertSMVer2Cores(pDevice->major, pDevice->minor);
for (Device& device: devices) {
ASSERT(device.computeMode != CU_COMPUTEMODE_PROHIBITED);
int sm_per_multiproc = _convertSMVer2Cores(device.major, device.minor);
int multiProcessorCount;
if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (CUdevice)pDevice->ID)) != CUDA_SUCCESS)
if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device.ID)) != CUDA_SUCCESS)
continue;
int clockRate;
if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, pDevice->ID)) != CUDA_SUCCESS)
if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device.ID)) != CUDA_SUCCESS)
continue;
size_t compute_perf = (size_t)multiProcessorCount * sm_per_multiproc * clockRate;
if (compute_perf > max_compute_perf &&
(best_SM_arch < 3 || // if we find GPU with SM major > 2, search only these
pDevice->major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
device.major == best_SM_arch) ) // if our device==dest_SM_arch, choose this, or else pass
{
max_compute_perf = compute_perf;
max_perf_device = pDevice;
max_perf_device = &device;
}
}
if (max_perf_device == NULL)
Expand Down Expand Up @@ -180,8 +180,8 @@ CUresult initDevice(int deviceID)
VERBOSE("CUDA error: compute capability 3.2 or greater required (available %d.%d for device[%d])", device.ID, device.major, device.minor);
return CUDA_ERROR_INVALID_DEVICE;
}
devices.Insert(device);
checkCudaError(cuCtxCreate(&devices.Last().ctx, CU_CTX_SCHED_AUTO, device.ID));
devices.emplace_back(device);
checkCudaError(cuCtxCreate(&devices.back().ctx, CU_CTX_SCHED_AUTO, device.ID));

#if TD_VERBOSE != TD_VERBOSE_OFF
char name[2048];
Expand Down
20 changes: 5 additions & 15 deletions libs/Common/UtilCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <curand_kernel.h>
#include <vector_types.h>

#include "UtilCUDADevice.h"


// D E F I N E S ///////////////////////////////////////////////////

Expand Down Expand Up @@ -66,7 +68,6 @@ inline CUresult __reportCudaError(CUresult result, LPCSTR errorMessage) {
return result;
}
#define reportCudaError(val) CUDA::__reportCudaError(val, #val)

#define checkCudaError(val) { const CUresult ret(CUDA::__reportCudaError(val, #val)); if (ret != CUDA_SUCCESS) return ret; }

// outputs the proper CUDA error code and abort in the event that a CUDA host call returns an error
Expand All @@ -77,18 +78,7 @@ inline void __ensureCudaResult(CUresult result, LPCSTR errorMessage) {
exit(EXIT_FAILURE);
}
#define ensureCudaResult(val) CUDA::__ensureCudaResult(val, #val)

inline void checkCudaCall(const cudaError_t error) {
if (error == cudaSuccess)
return;
#ifdef _DEBUG
VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
#else
DEBUG("CUDA error: %s (code %d)", cudaGetErrorString(error), error);
#endif
ASSERT("CudaError" == NULL);
exit(EXIT_FAILURE);
}
/*----------------------------------------------------------------*/

// rounds up addr to the align boundary
template <typename T>
Expand Down Expand Up @@ -478,7 +468,7 @@ class TArrayRT

public:
inline TArrayRT() : hArray(NULL) {}
inline TArrayRT(const Image8U::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
inline TArrayRT(const cv::Size& size, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(size, flags)); }
inline TArrayRT(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) : hArray(NULL) { reportCudaError(Reset(width, height, depth, flags)); }
inline ~TArrayRT() { Release(); }

Expand All @@ -498,7 +488,7 @@ class TArrayRT
hArray = NULL;
}
}
inline CUresult Reset(const Image8U::Size& size, unsigned flags=0) {
inline CUresult Reset(const cv::Size& size, unsigned flags=0) {
return Reset((unsigned)size.width, (unsigned)size.height, 0, flags);
}
CUresult Reset(unsigned width, unsigned height, unsigned depth=0, unsigned flags=0) {
Expand Down
85 changes: 85 additions & 0 deletions libs/Common/UtilCUDADevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
////////////////////////////////////////////////////////////////////
// UtilCUDADevice.h
//
// Copyright 2024 cDc@seacave
// Distributed under the Boost Software License, Version 1.0
// (See http://www.boost.org/LICENSE_1_0.txt)

#ifndef __SEACAVE_CUDA_DEVICE_H__
#define __SEACAVE_CUDA_DEVICE_H__


// I N C L U D E S /////////////////////////////////////////////////

#include "Config.h"

// CUDA driver
#include <cuda.h>

// CUDA toolkit
#include <cuda_runtime.h>

#include <memory>


// D E F I N E S ///////////////////////////////////////////////////

// Fallback logger used only if the project-wide VERBOSE macro is not already
// defined; writes to stderr (note: no trailing newline is appended here).
// DEFINE_VERBOSE records that we defined it, so it is #undef'd at end of header.
#ifndef VERBOSE
#define DEFINE_VERBOSE 1
#define VERBOSE(...) fprintf(stderr, __VA_ARGS__)
#endif

// check for CUDA errors following a CUDA call
#define CUDA_CHECK(condition) SEACAVE::CUDA::checkCudaCall(condition)

// check cudaGetLastError() for success
// NOTE(review): this macro expands with a trailing semicolon already baked in,
// so call sites that add their own ';' produce an empty statement — harmless at
// statement level, but beware inside unbraced if/else.
#define CUDA_CHECK_LAST_ERROR CUDA_CHECK(cudaGetLastError());


// S T R U C T S ///////////////////////////////////////////////////

namespace SEACAVE {

namespace CUDA {

inline void checkCudaCall(const cudaError_t error) {
if (error == cudaSuccess)
return;
VERBOSE("CUDA error at %s:%d: %s (code %d)", __FILE__, __LINE__, cudaGetErrorString(error), error);
ASSERT("CudaError" == NULL);
exit(EXIT_FAILURE);
}

// define smart pointers for CUDA stream
struct CudaStreamDestructor {
void operator()(cudaStream_t s) {
if (s)
CUDA_CHECK(cudaStreamDestroy(s));
}
};

typedef std::unique_ptr<std::remove_pointer<cudaStream_t>::type, CudaStreamDestructor> CudaStreamPtr;
inline CudaStreamPtr CreateStream() {
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
return CudaStreamPtr(stream, CudaStreamDestructor());
}

typedef std::shared_ptr<std::remove_pointer<cudaStream_t>::type> CudaStreamSharedPtr;
inline CudaStreamSharedPtr CreateSharedStream() {
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
return CudaStreamSharedPtr(stream, CudaStreamDestructor());
}
/*----------------------------------------------------------------*/

} // namespace CUDA

} // namespace SEACAVE

#ifdef DEFINE_VERBOSE
#undef DEFINE_VERBOSE
#undef VERBOSE
#endif

#endif // __SEACAVE_CUDA_DEVICE_H__
42 changes: 21 additions & 21 deletions libs/MVS/PatchMatchCUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,26 +108,26 @@ void PatchMatchCUDA::Init(bool bGeomConsistency)
void PatchMatchCUDA::AllocatePatchMatchCUDA(const cv::Mat1f& image)
{
const size_t num_images = images.size();
CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
CUDA::checkCudaCall(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
CUDA_CHECK(cudaMalloc((void**)&cudaTextureImages, sizeof(cudaTextureObject_t) * num_images));
CUDA_CHECK(cudaMalloc((void**)&cudaCameras, sizeof(Camera) * num_images));
if (params.bGeomConsistency)
CUDA::checkCudaCall(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));
CUDA_CHECK(cudaMalloc((void**)&cudaTextureDepths, sizeof(cudaTextureObject_t) * (num_images-1)));

const size_t size = image.size().area();
depthNormalEstimates = new Point4[size];
CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));
CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalEstimates, sizeof(Point4) * size));

CUDA::checkCudaCall(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
CUDA::checkCudaCall(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
CUDA::checkCudaCall(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
CUDA_CHECK(cudaMalloc((void**)&cudaDepthNormalCosts, sizeof(float) * size));
CUDA_CHECK(cudaMalloc((void**)&cudaSelectedViews, sizeof(unsigned) * size));
CUDA_CHECK(cudaMalloc((void**)&cudaRandStates, sizeof(curandState) * size));
}

void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bInitImage, bool bInitDepthMap)
{
const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

if (bInitImage) {
CUDA::checkCudaCall(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));
CUDA_CHECK(cudaMallocArray(&cudaImageArrays[i], &channelDesc, image.cols, image.rows));

struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(cudaResourceDesc));
Expand All @@ -142,7 +142,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;

CUDA::checkCudaCall(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
CUDA_CHECK(cudaCreateTextureObject(&textureImages[i], &resDesc, &texDesc, NULL));
}

if (params.bGeomConsistency && i > 0) {
Expand All @@ -152,7 +152,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
return;
}

CUDA::checkCudaCall(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));
CUDA_CHECK(cudaMallocArray(&cudaDepthArrays[i-1], &channelDesc, image.cols, image.rows));

struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(cudaResourceDesc));
Expand All @@ -167,7 +167,7 @@ void PatchMatchCUDA::AllocateImageCUDA(size_t i, const cv::Mat1f& image, bool bI
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;

CUDA::checkCudaCall(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
CUDA_CHECK(cudaCreateTextureObject(&textureDepths[i-1], &resDesc, &texDesc, NULL));
}
}

Expand Down Expand Up @@ -213,7 +213,7 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
cv::resize(lowResDepthMap, depthData.depthMap, size, 0, 0, cv::INTER_LINEAR);
cv::resize(lowResNormalMap, depthData.normalMap, size, 0, 0, cv::INTER_NEAREST);
cv::resize(lowResViewsMap, depthData.viewsMap, size, 0, 0, cv::INTER_NEAREST);
CUDA::checkCudaCall(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
CUDA_CHECK(cudaMalloc((void**)&cudaLowDepths, sizeof(float) * size.area()));
} else {
if (totalScaleNumber > 0) {
// smallest resolution, when multi-resolution is enabled
Expand Down Expand Up @@ -289,13 +289,13 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
}
AllocateImageCUDA(i, image, false, !view.depthMap.empty());
}
CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr<float>(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy2DToArray(cudaImageArrays[i], 0, 0, image.ptr<float>(), image.step[0], image.cols * sizeof(float), image.rows, cudaMemcpyHostToDevice));
if (params.bGeomConsistency && i > 0 && !view.depthMap.empty()) {
// set previously computed depth-map
DepthMap depthMap(view.depthMap);
if (depthMap.size() != image.size())
cv::resize(depthMap, depthMap, image.size(), 0, 0, cv::INTER_LINEAR);
CUDA::checkCudaCall(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr<float>(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy2DToArray(cudaDepthArrays[i-1], 0, 0, depthMap.ptr<float>(), depthMap.step[0], sizeof(float) * depthMap.cols, depthMap.rows, cudaMemcpyHostToDevice));
}
images[i] = std::move(image);
cameras[i] = std::move(camera);
Expand Down Expand Up @@ -323,12 +323,12 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
prevNumImages = numImages;

// setup CUDA memory
CUDA::checkCudaCall(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
CUDA::checkCudaCall(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(cudaTextureImages, textureImages.data(), sizeof(cudaTextureObject_t) * numImages, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(cudaCameras, cameras.data(), sizeof(Camera) * numImages, cudaMemcpyHostToDevice));
if (params.bGeomConsistency) {
// set previously computed depth-maps
ASSERT(depthData.depthMap.size() == depthData.GetView().image.size());
CUDA::checkCudaCall(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(cudaTextureDepths, textureDepths.data(), sizeof(cudaTextureObject_t) * params.nNumViews, cudaMemcpyHostToDevice));
}

// load depth-map and normal-map into CUDA memory
Expand All @@ -342,20 +342,20 @@ void PatchMatchCUDA::EstimateDepthMap(DepthData& depthData)
depthNormal.w() = depthData.depthMap(r, c);
}
}
CUDA::checkCudaCall(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(cudaDepthNormalEstimates, depthNormalEstimates, sizeof(Point4) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));

// load low resolution depth-map into CUDA memory
if (params.bLowResProcessed) {
ASSERT(depthData.depthMap.isContinuous());
CUDA::checkCudaCall(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr<float>(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(cudaLowDepths, depthData.depthMap.ptr<float>(), sizeof(float) * depthData.depthMap.size().area(), cudaMemcpyHostToDevice));
}

// run CUDA patch-match
ASSERT(!depthData.viewsMap.empty());
RunCUDA(depthData.confMap.getData(), (uint32_t*)depthData.viewsMap.getData());
CUDA::checkCudaCall(cudaGetLastError());
CUDA_CHECK(cudaGetLastError());
if (params.bLowResProcessed)
CUDA::checkCudaCall(cudaFree(cudaLowDepths));
CUDA_CHECK(cudaFree(cudaLowDepths));

// load depth-map, normal-map and confidence-map from CUDA memory
for (int r = 0; r < depthData.depthMap.rows; ++r) {
Expand Down

0 comments on commit a759b87

Please sign in to comment.