Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda - update npp calls to use the new NppStreamContext API if available #3803

Merged
merged 1 commit on Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 43 additions & 6 deletions modules/cudaarithm/src/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,11 @@ namespace
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
#if USE_NPP_STREAM_CTX
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
#else
typedef NppStatus(*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
#endif
};

template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
Expand All @@ -94,9 +98,15 @@ namespace
sz.width = src.cols;
sz.height = src.rows;

#if USE_NPP_STREAM_CTX
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
#else
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
#endif

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
Expand All @@ -107,7 +117,11 @@ namespace
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
#if USE_NPP_STREAM_CTX
typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
#else
typedef NppStatus(*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
#endif
};

template <int DEPTH, typename NppMirrorIFunc<DEPTH>::func_t func> struct NppMirrorI
Expand All @@ -121,10 +135,15 @@ namespace
NppiSize sz;
sz.width = srcDst.cols;
sz.height = srcDst.rows;

#if USE_NPP_STREAM_CTX
nppSafeCall(func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
#else
nppSafeCall( func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
#endif

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
Expand All @@ -137,23 +156,41 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
#if USE_NPP_STREAM_CTX
{NppMirror<CV_8U, nppiMirror_8u_C1R_Ctx>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R_Ctx>::call, NppMirror<CV_8U, nppiMirror_8u_C4R_Ctx>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R_Ctx>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R_Ctx>::call, NppMirror<CV_16U, nppiMirror_16u_C4R_Ctx>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R_Ctx>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R_Ctx>::call, NppMirror<CV_32S, nppiMirror_32s_C4R_Ctx>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R_Ctx>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R_Ctx>::call, NppMirror<CV_32F, nppiMirror_32f_C4R_Ctx>::call}
#else
{ NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call },
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
#endif
};

typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream);
static const ifunc_t ifuncs[6][4] =
{
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call},
#if USE_NPP_STREAM_CTX
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR_Ctx>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR_Ctx>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR_Ctx>::call},
{0,0,0,0},
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR_Ctx>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR_Ctx>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR_Ctx>::call},
{0,0,0,0},
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR_Ctx>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR_Ctx>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR_Ctx>::call},
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR_Ctx>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR_Ctx>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR_Ctx>::call}
#else
{ NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call },
{0,0,0,0},
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR>::call},
{0,0,0,0},
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR>::call},
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR>::call}
#endif
};

GpuMat src = getInputMat(_src, stream);
Expand Down
47 changes: 41 additions & 6 deletions modules/cudaarithm/src/cuda/bitwise_scalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,11 @@ namespace
{
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;

#if USE_NPP_STREAM_CTX
typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
#else
typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
#endif
};

template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
Expand All @@ -116,7 +120,11 @@ namespace
cv::saturate_cast<npp_type>(value[3])
};

#if USE_NPP_STREAM_CTX
nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI, h));
#else
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
#endif

if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
Expand All @@ -131,13 +139,39 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
static const func_t funcs[3][6][4] =
{
#if USE_NPP_STREAM_CTX
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R_Ctx>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R_Ctx>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R_Ctx>::call},
{BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R_Ctx>::call}
},
{
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R_Ctx >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R_Ctx>::call},
{BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R_Ctx>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R_Ctx>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R_Ctx>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R_Ctx>::call}
}
#else
{
{ BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call, 0, NppBitwiseC<CV_8U, 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call },
{ BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call },
{ BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call },
{ BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call },
{ BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call },
{ BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call }
},
{
{BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
Expand All @@ -155,6 +189,7 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
{BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
}
#endif
};

const int depth = src.depth();
Expand Down
5 changes: 5 additions & 0 deletions modules/cudaarithm/src/cuda/threshold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
sz.width = src.cols;
sz.height = src.rows;

#if USE_NPP_STREAM_CTX
nppSafeCall(nppiThreshold_32f_C1R_Ctx(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER, h));
#else
nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
#endif

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
Expand Down
5 changes: 5 additions & 0 deletions modules/cudaarithm/src/cuda/transpose.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,13 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
sz.width = src.cols;
sz.height = src.rows;

#if USE_NPP_STREAM_CTX
nppSafeCall(nppiTranspose_8u_C1R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, h));
#else
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
#endif

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
Expand Down
Loading
Loading