Skip to content

Commit

Permalink
cudaimgproc: update for CUDA 8.0 and fix out of bounds memory error
Browse files Browse the repository at this point in the history
  • Loading branch information
cudawarped committed Jan 5, 2024
1 parent c7602a8 commit 1334d92
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 21 deletions.
34 changes: 23 additions & 11 deletions modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,10 +800,21 @@ enum MomentsOrder {
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@returns number of image moments.
@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder
@sa cuda::spatialMoments, cuda::moments, cuda::MomentsOrder
*/
CV_EXPORTS_W int numMoments(const MomentsOrder order);

/** @brief Converts the spatial image moments returned from cuda::spatialMoments to cv::Moments.
@param spatialMoments Spatial moments returned from cuda::spatialMoments.
@param order Order used when calculating image moments with cuda::spatialMoments.
@param momentsType Precision used when calculating image moments with cuda::spatialMoments.
@returns cv::Moments.
@sa cuda::spatialMoments, cuda::moments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
*/
CV_EXPORTS_W Moments cvtToMoments(Mat spatialMoments, const MomentsOrder order, const int momentsType);

/** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape.
Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape.
Expand All @@ -813,24 +824,25 @@ Each moment is returned as a column entry in the 1D \a moments array.
@param [out] moments 1D array with each column entry containing a spatial image moment.
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@param momentsType Precision to use when calculating moments. Available types are \ref CV_32F and \ref CV_64F with the performance of \ref CV_32F an order of magnitude greater than \ref CV_64F. If the image is small the accuracy from \ref CV_32F can be equal or very close to \ref CV_64F.
@param stream Stream for the asynchronous version.
@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as
@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == \ref CV_32F \a moments can be allocated as
```
GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F)
```
The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g.
The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cuda::cvtToMoments helper function. e.g.
```
HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
momentsDevice.download(momentsHostMem, stream);
HostMem spatialMomentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
spatialMomentsDevice.download(spatialMomentsHostMem, stream);
stream.waitForCompletion();
Mat momentsMat = momentsHostMem.createMatHeader();
cv::Moments cvMoments(momentsMat.at<float>(0), momentsMat.at<float>(1), momentsMat.at<float>(2), momentsMat.at<float>(3), momentsMat.at<float>(4), momentsMat.at<float>(5), momentsMat.at<float>(6), momentsMat.at<float>(7), momentsMat.at<float>(8), momentsMat.at<float>(9));
Mat spatialMoments = spatialMomentsHostMem.createMatHeader();
cv::Moments cvMoments = cvtToMomentsT<float>(spatialMoments, order);
```
see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::moments
@sa cuda::moments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
*/
CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());

Expand All @@ -842,7 +854,7 @@ results are returned in the structure cv::Moments.
@param src Raster image (single-channel 2D array).
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@param momentsType Precision to use when calculating moments. Available types are \ref CV_32F and \ref CV_64F with the performance of \ref CV_32F an order of magnitude greater than \ref CV_64F. If the image is small the accuracy from \ref CV_32F can be equal or very close to \ref CV_64F.
@note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host.
The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g.
Expand All @@ -852,7 +864,7 @@ The costly HostMem allocation cannot be avoided however the GpuMat device alloca
```
see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::spatialMoments
@sa cuda::spatialMoments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
*/
CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F);

Expand Down
10 changes: 9 additions & 1 deletion modules/cudaimgproc/src/cuda/moments.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,22 @@ constexpr int blockSizeY = 16;
template <typename T>
__device__ T butterflyWarpReduction(T value) {
for (int i = 16; i >= 1; i /= 2)
#if (CUDART_VERSION >= 9000)
value += __shfl_xor_sync(0xffffffff, value, i, 32);
#else
value += __shfl_xor(value, i, 32);
#endif
return value;
}

template <typename T>
__device__ T butterflyHalfWarpReduction(T value) {
for (int i = 8; i >= 1; i /= 2)
value += __shfl_xor_sync(0xffff, value, i, 32);
#if (CUDART_VERSION >= 9000)
value += __shfl_xor_sync(0xffff, value, i, 16);
#else
value += __shfl_xor(value, i, 16);
#endif
return value;
}

Expand Down
26 changes: 21 additions & 5 deletions modules/cudaimgproc/src/moments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,25 @@ int cv::cuda::numMoments(const MomentsOrder order) {
return order == MomentsOrder::FIRST_ORDER_MOMENTS ? device::imgproc::n1 : order == MomentsOrder::SECOND_ORDER_MOMENTS ? device::imgproc::n12 : device::imgproc::n123;
}

template<typename T>
cv::Moments cvtToMomentsT(Mat spatialMoments, const MomentsOrder order) {
switch (order) {
case MomentsOrder::FIRST_ORDER_MOMENTS:
return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), 0, 0, 0, 0, 0, 0, 0);
case MomentsOrder::SECOND_ORDER_MOMENTS:
return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), spatialMoments.at<T>(3), spatialMoments.at<T>(4), spatialMoments.at<T>(5), 0, 0, 0, 0);
default:
return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), spatialMoments.at<T>(3), spatialMoments.at<T>(4), spatialMoments.at<T>(5), spatialMoments.at<T>(6), spatialMoments.at<T>(7), spatialMoments.at<T>(8), spatialMoments.at<T>(9));
}
}

cv::Moments cv::cuda::cvtToMoments(Mat spatialMoments, const MomentsOrder order, const int momentsType) {
if (momentsType == CV_32F)
return cvtToMomentsT<float>(spatialMoments, order);
else
return cvtToMomentsT<double>(spatialMoments, order);
}

#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) { throw_no_cuda(); }
void spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) { throw_no_cuda(); }
Expand Down Expand Up @@ -53,15 +72,12 @@ void cv::cuda::spatialMoments(InputArray src, OutputArray moments, const bool bi
}

Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) {
Stream& stream = Stream::Null();
Stream stream;
HostMem dst;
spatialMoments(src, dst, binary, order, momentsType, stream);
stream.waitForCompletion();
Mat moments = dst.createMatHeader();
if(momentsType == CV_32F)
return Moments(moments.at<float>(0), moments.at<float>(1), moments.at<float>(2), moments.at<float>(3), moments.at<float>(4), moments.at<float>(5), moments.at<float>(6), moments.at<float>(7), moments.at<float>(8), moments.at<float>(9));
else
return Moments(moments.at<double>(0), moments.at<double>(1), moments.at<double>(2), moments.at<double>(3), moments.at<double>(4), moments.at<double>(5), moments.at<double>(6), moments.at<double>(7), moments.at<double>(8), moments.at<double>(9));
return cvtToMoments(moments, order, momentsType);
}

#endif /* !defined (HAVE_CUDA) */
5 changes: 1 addition & 4 deletions modules/cudaimgproc/test/test_moments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,7 @@ CUDA_TEST_P(Moments, Async)
HostMem momentsHost(1, nMoments, momentsType);
momentsDevice.download(momentsHost, stream);
stream.waitForCompletion();
Mat momentsHost64F = momentsHost.createMatHeader();
if (momentsType == CV_32F)
momentsHost.createMatHeader().convertTo(momentsHost64F, CV_64F);
const cv::Moments moments = cv::Moments(momentsHost64F.at<double>(0), momentsHost64F.at<double>(1), momentsHost64F.at<double>(2), momentsHost64F.at<double>(3), momentsHost64F.at<double>(4), momentsHost64F.at<double>(5), momentsHost64F.at<double>(6), momentsHost64F.at<double>(7), momentsHost64F.at<double>(8), momentsHost64F.at<double>(9));
const cv::Moments moments = cvtToMoments(momentsHost.createMatHeader(), order, momentsType);
Mat imgHostAdjustedType = imgHost(roi);
if (imgType != CV_8U && imgType != CV_32F)
imgHost(roi).convertTo(imgHostAdjustedType, CV_32F);
Expand Down

0 comments on commit 1334d92

Please sign in to comment.