cudaimgproc: update for CUDA 8.0 and fix out of bounds memory error

cudawarped · cudawarped · commit 1334d923dd65 · 2024-01-05T08:37:57.000+02:00
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -800,10 +800,21 @@ enum MomentsOrder {
 @param order Order of largest moments to calculate with lower order moments requiring less computation.
 @returns number of image moments.
 
-@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder
+@sa cuda::spatialMoments, cuda::moments, cuda::MomentsOrder
  */
 CV_EXPORTS_W int numMoments(const MomentsOrder order);
 
+/** @brief Converts the spatial image moments returned from cuda::spatialMoments to cv::Moments.
+@param spatialMoments Spatial moments returned from cuda::spatialMoments.
+@param order Order used when calculating image moments with cuda::spatialMoments.
+@param momentsType Precision used when calculating image moments with cuda::spatialMoments.
+
+@returns cv::Moments.
+
+@sa cuda::spatialMoments, cuda::moments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
+ */
+CV_EXPORTS_W Moments cvtToMoments(Mat spatialMoments, const MomentsOrder order, const int momentsType);
+
 /** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape.
 
 Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape.
@@ -813,24 +824,25 @@ Each moment is returned as a column entry in the 1D \a moments array.
 @param [out] moments 1D array with each column entry containing a spatial image moment.
 @param binaryImage If it is true, all non-zero image pixels are treated as 1's.
 @param order Order of largest moments to calculate with lower order moments requiring less computation.
-@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
+@param momentsType Precision to use when calculating moments. Available types are \ref CV_32F and \ref CV_64F with the performance of \ref CV_32F an order of magnitude greater than \ref CV_64F. If the image is small the accuracy from \ref CV_32F can be equal or very close to \ref CV_64F.
 @param stream Stream for the asynchronous version.
 
-@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as
+@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == \ref CV_32F \a moments can be allocated as
 ```
 GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F)
 ```
-The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g.
+The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cuda::cvtToMoments helper function. e.g.
 ```
-HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
-momentsDevice.download(momentsHostMem, stream);
+HostMem spatialMomentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
+spatialMomentsDevice.download(spatialMomentsHostMem, stream);
 stream.waitForCompletion();
-Mat momentsMat = momentsHostMem.createMatHeader();
-cv::Moments cvMoments(momentsMat.at<float>(0), momentsMat.at<float>(1), momentsMat.at<float>(2), momentsMat.at<float>(3), momentsMat.at<float>(4), momentsMat.at<float>(5), momentsMat.at<float>(6), momentsMat.at<float>(7), momentsMat.at<float>(8), momentsMat.at<float>(9));
+Mat spatialMoments = spatialMomentsHostMem.createMatHeader();
+cv::Moments cvMoments = cvtToMomentsT<float>(spatialMoments, order);
 ```
+
 see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
 @returns cv::Moments.
-@sa cuda::moments
+@sa cuda::moments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
 */
 CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());
 
@@ -842,7 +854,7 @@ results are returned in the structure cv::Moments.
 @param src Raster image (single-channel 2D array).
 @param binaryImage If it is true, all non-zero image pixels are treated as 1's.
 @param order Order of largest moments to calculate with lower order moments requiring less computation.
- @param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
+ @param momentsType Precision to use when calculating moments. Available types are \ref CV_32F and \ref CV_64F with the performance of \ref CV_32F an order of magnitude greater than \ref CV_64F. If the image is small the accuracy from \ref CV_32F can be equal or very close to \ref CV_64F.
 
 @note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host.
 The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g.
@@ -852,7 +864,7 @@ The costly HostMem allocation cannot be avoided however the GpuMat device alloca
 ```
 see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
 @returns cv::Moments.
-@sa cuda::spatialMoments
+@sa cuda::spatialMoments, cuda::cvtToMoments, cuda::numMoments, cuda::MomentsOrder
  */
 CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F);
 
diff --git a/modules/cudaimgproc/src/cuda/moments.cu b/modules/cudaimgproc/src/cuda/moments.cu
@@ -16,14 +16,22 @@ constexpr int blockSizeY = 16;
 template <typename T>
 __device__ T butterflyWarpReduction(T value) {
     for (int i = 16; i >= 1; i /= 2)
+#if (CUDART_VERSION >= 9000)
         value += __shfl_xor_sync(0xffffffff, value, i, 32);
+#else
+        value += __shfl_xor(value, i, 32);
+#endif
     return value;
 }
 
 template <typename T>
 __device__ T butterflyHalfWarpReduction(T value) {
     for (int i = 8; i >= 1; i /= 2)
-        value += __shfl_xor_sync(0xffff, value, i, 32);
+#if (CUDART_VERSION >= 9000)
+        value += __shfl_xor_sync(0xffff, value, i, 16);
+#else
+        value += __shfl_xor(value, i, 16);
+#endif
     return value;
 }
 
diff --git a/modules/cudaimgproc/src/moments.cpp b/modules/cudaimgproc/src/moments.cpp
@@ -12,6 +12,25 @@ int cv::cuda::numMoments(const MomentsOrder order) {
     return order == MomentsOrder::FIRST_ORDER_MOMENTS ? device::imgproc::n1 : order == MomentsOrder::SECOND_ORDER_MOMENTS ? device::imgproc::n12 : device::imgproc::n123;
 }
 
+template<typename T>
+cv::Moments cvtToMomentsT(Mat spatialMoments, const MomentsOrder order) {
+    switch (order) {
+    case MomentsOrder::FIRST_ORDER_MOMENTS:
+        return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), 0, 0, 0, 0, 0, 0, 0);
+    case MomentsOrder::SECOND_ORDER_MOMENTS:
+        return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), spatialMoments.at<T>(3), spatialMoments.at<T>(4), spatialMoments.at<T>(5), 0, 0, 0, 0);
+    default:
+        return Moments(spatialMoments.at<T>(0), spatialMoments.at<T>(1), spatialMoments.at<T>(2), spatialMoments.at<T>(3), spatialMoments.at<T>(4), spatialMoments.at<T>(5), spatialMoments.at<T>(6), spatialMoments.at<T>(7), spatialMoments.at<T>(8), spatialMoments.at<T>(9));
+    }
+}
+
+cv::Moments cv::cuda::cvtToMoments(Mat spatialMoments, const MomentsOrder order, const int momentsType) {
+    if (momentsType == CV_32F)
+        return cvtToMomentsT<float>(spatialMoments, order);
+    else
+        return cvtToMomentsT<double>(spatialMoments, order);
+}
+
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
     Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) { throw_no_cuda(); }
     void spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) { throw_no_cuda(); }
@@ -53,15 +72,12 @@ void cv::cuda::spatialMoments(InputArray src, OutputArray moments, const bool bi
 }
 
 Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) {
-    Stream& stream = Stream::Null();
+    Stream stream;
     HostMem dst;
     spatialMoments(src, dst, binary, order, momentsType, stream);
     stream.waitForCompletion();
     Mat moments = dst.createMatHeader();
-    if(momentsType == CV_32F)
-        return Moments(moments.at<float>(0), moments.at<float>(1), moments.at<float>(2), moments.at<float>(3), moments.at<float>(4), moments.at<float>(5), moments.at<float>(6), moments.at<float>(7), moments.at<float>(8), moments.at<float>(9));
-    else
-        return Moments(moments.at<double>(0), moments.at<double>(1), moments.at<double>(2), moments.at<double>(3), moments.at<double>(4), moments.at<double>(5), moments.at<double>(6), moments.at<double>(7), moments.at<double>(8), moments.at<double>(9));
+    return cvtToMoments(moments, order, momentsType);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudaimgproc/test/test_moments.cpp b/modules/cudaimgproc/test/test_moments.cpp
@@ -101,10 +101,7 @@ CUDA_TEST_P(Moments, Async)
     HostMem momentsHost(1, nMoments, momentsType);
     momentsDevice.download(momentsHost, stream);
     stream.waitForCompletion();
-    Mat momentsHost64F = momentsHost.createMatHeader();
-    if (momentsType == CV_32F)
-        momentsHost.createMatHeader().convertTo(momentsHost64F, CV_64F);
-    const cv::Moments moments = cv::Moments(momentsHost64F.at<double>(0), momentsHost64F.at<double>(1), momentsHost64F.at<double>(2), momentsHost64F.at<double>(3), momentsHost64F.at<double>(4), momentsHost64F.at<double>(5), momentsHost64F.at<double>(6), momentsHost64F.at<double>(7), momentsHost64F.at<double>(8), momentsHost64F.at<double>(9));
+    const cv::Moments moments  = cvtToMoments(momentsHost.createMatHeader(), order, momentsType);
     Mat imgHostAdjustedType = imgHost(roi);
     if (imgType != CV_8U && imgType != CV_32F)
         imgHost(roi).convertTo(imgHostAdjustedType, CV_32F);