From a7fdf2f6d766f5eaf02dc0bddf5ac629e941e5aa Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Wed, 2 Oct 2024 13:52:44 +0300 Subject: [PATCH] cuda - update npp calls to use the new NppStreamContext API if available --- modules/cudaarithm/src/core.cpp | 49 +++++- modules/cudaarithm/src/cuda/bitwise_scalar.cu | 47 ++++- modules/cudaarithm/src/cuda/threshold.cu | 5 + modules/cudaarithm/src/cuda/transpose.cu | 5 + modules/cudaarithm/src/element_operations.cpp | 49 ++++++ modules/cudaarithm/src/reductions.cpp | 47 ++++- modules/cudacodec/src/video_reader.cpp | 23 ++- modules/cudafilters/src/filtering.cpp | 117 +++++++++++-- modules/cudaimgproc/src/color.cpp | 39 ++++- modules/cudaimgproc/src/histogram.cpp | 164 +++++++++++++++--- modules/cudawarping/src/warp.cpp | 70 ++++++++ 11 files changed, 539 insertions(+), 76 deletions(-) diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp index 223929881fd..449cb308a2d 100644 --- a/modules/cudaarithm/src/core.cpp +++ b/modules/cudaarithm/src/core.cpp @@ -79,7 +79,11 @@ namespace { typedef typename NppTypeTraits::npp_t npp_t; - typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip); +#if USE_NPP_STREAM_CTX + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx); +#else + typedef NppStatus(*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip); +#endif }; template ::func_t func> struct NppMirror @@ -94,9 +98,15 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if USE_NPP_STREAM_CTX nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, - (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) ); +#else + nppSafeCall( func(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -107,7 +117,11 @@ namespace { typedef typename NppTypeTraits::npp_t npp_t; - typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip); +#if USE_NPP_STREAM_CTX + typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx); +#else + typedef NppStatus(*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip); +#endif }; template ::func_t func> struct NppMirrorI @@ -121,10 +135,15 @@ namespace NppiSize sz; sz.width = srcDst.cols; sz.height = srcDst.rows; - +#if USE_NPP_STREAM_CTX + nppSafeCall(func(srcDst.ptr(), static_cast(srcDst.step), + sz, + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) ); +#else nppSafeCall( func(srcDst.ptr(), static_cast(srcDst.step), sz, (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -137,23 +156,41 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream); static const func_t funcs[6][4] = { - {NppMirror::call, 0, NppMirror::call, NppMirror::call}, +#if USE_NPP_STREAM_CTX + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call} +#else + { NppMirror::call, 0, NppMirror::call, NppMirror::call }, {0,0,0,0}, {NppMirror::call, 0, NppMirror::call, NppMirror::call}, {0,0,0,0}, {NppMirror::call, 0, NppMirror::call, NppMirror::call}, {NppMirror::call, 0, NppMirror::call, NppMirror::call} +#endif }; typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream); static const ifunc_t ifuncs[6][4] = { - {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, +#if USE_NPP_STREAM_CTX + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {0,0,0,0}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {0,0,0,0}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, + {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call} +#else + { NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call }, {0,0,0,0}, {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, {0,0,0,0}, {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}, {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call} +#endif }; GpuMat src = getInputMat(_src, stream); diff --git a/modules/cudaarithm/src/cuda/bitwise_scalar.cu b/modules/cudaarithm/src/cuda/bitwise_scalar.cu index 0dd99e8cd85..1ff3edd7b48 100644 --- a/modules/cudaarithm/src/cuda/bitwise_scalar.cu +++ b/modules/cudaarithm/src/cuda/bitwise_scalar.cu @@ -92,7 +92,11 @@ namespace { typedef typename NPPTypeTraits::npp_type npp_type; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx); +#else typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI); +#endif }; template ::func_t func> struct NppBitwiseC @@ -116,7 +120,11 @@ namespace cv::saturate_cast(value[3]) }; +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI, h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); +#endif if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); @@ -131,13 +139,39 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream); static const func_t funcs[3][6][4] = { +#if USE_NPP_STREAM_CTX + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, { - {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, - {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, - {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, - {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, - {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}, - {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + } +#else + { + { BitScalar >::call, 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call }, + { BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call }, + { BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call }, + { BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call }, + { BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call }, + { BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call } }, { {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call}, @@ -155,6 +189,7 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}, {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} } +#endif }; const int depth = src.depth(); diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 1249fee04be..cfa88dfae7c 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -116,8 +116,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou sz.width = src.cols; sz.height = src.rows; +#if USE_NPP_STREAM_CTX + nppSafeCall(nppiThreshold_32f_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER, h)); +#else nppSafeCall( nppiThreshold_32f_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER) ); +#endif if (!stream) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index bfe50bd34fb..8dae081731e 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -74,8 +74,13 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) sz.width = src.cols; sz.height = src.rows; +#if USE_NPP_STREAM_CTX + nppSafeCall(nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, h)); +#else nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); +#endif if (!stream) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp index 1ad3c17c40f..037abc07821 100644 --- a/modules/cudaarithm/src/element_operations.cpp +++ b/modules/cudaarithm/src/element_operations.cpp @@ -337,13 +337,21 @@ namespace { typedef typename NPPTypeTraits::npp_type npp_type; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx); +#else typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI); +#endif }; template struct NppShiftFunc { typedef typename NPPTypeTraits::npp_type npp_type; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx); +#else typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI); +#endif }; template ::func_t func> struct NppShift @@ -358,7 +366,11 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI, h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -376,7 +388,11 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI, h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -389,11 +405,20 @@ void cv::cuda::rshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); static const func_t funcs[5][4] = { +#if USE_NPP_STREAM_CTX + + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, +#else {NppShift::call, 0, NppShift::call, NppShift::call }, {NppShift::call, 0, NppShift::call, NppShift::call }, {NppShift::call, 0, NppShift::call, NppShift::call}, {NppShift::call, 0, NppShift::call, NppShift::call}, {NppShift::call, 0, NppShift::call, NppShift::call}, +#endif }; GpuMat src = getInputMat(_src, stream); @@ -413,11 +438,19 @@ void cv::cuda::lshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); static const func_t funcs[5][4] = { +#if USE_NPP_STREAM_CTX + {NppShift::call , 0, NppShift::call , NppShift::call }, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, +#else {NppShift::call , 0, NppShift::call , NppShift::call }, {0 , 0, 0 , 0 }, {NppShift::call, 0, NppShift::call, NppShift::call}, {0 , 0, 0 , 0 }, {NppShift::call, 0, NppShift::call, NppShift::call}, +#endif }; GpuMat src = getInputMat(_src, stream); @@ -463,7 +496,11 @@ void cv::cuda::max(InputArray src1, InputArray src2, OutputArray dst, Stream& st namespace { +#if USE_NPP_STREAM_CTX + typedef NppStatus(*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx); +#else typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); +#endif void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream) { @@ -475,7 +512,11 @@ namespace NppStreamHandler h(stream); +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -488,7 +529,11 @@ void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream) GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream); +#if USE_NPP_STREAM_CTX + npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream)); +#else npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream)); +#endif syncOutput(dst, _dst, stream); } @@ -499,7 +544,11 @@ void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream) GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream); +#if USE_NPP_STREAM_CTX + npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream)); +#else npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); +#endif syncOutput(dst, _dst, stream); } diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index b70a128558f..d89a2984303 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -151,32 +151,44 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream) sz.width = gsrc.cols; sz.height = gsrc.rows; -#if (CUDA_VERSION >= 12040) +#if (NPP_VERSION >= 12205) size_t bufSize; #else int bufSize; #endif + NppStreamHandler h(StreamAccessor::getStream(stream)); + #if (CUDA_VERSION <= 4020) nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) ); +#else +#if USE_NPP_STREAM_CTX + if (gsrc.type() == CV_8UC1) + nppSafeCall(nppiMeanStdDevGetBufferHostSize_8u_C1R_Ctx(sz, &bufSize, h)); + else + nppSafeCall(nppiMeanStdDevGetBufferHostSize_32f_C1R_Ctx(sz, &bufSize, h)); #else if (gsrc.type() == CV_8UC1) nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) ); else nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R(sz, &bufSize) ); +#endif #endif BufferPool pool(stream); CV_Assert(bufSize <= std::numeric_limits::max()); GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type()); - - // detail: https://github.com/opencv/opencv/issues/11063 - //NppStreamHandler h(StreamAccessor::getStream(stream)); - +#if USE_NPP_STREAM_CTX + if (gsrc.type() == CV_8UC1) + nppSafeCall(nppiMean_StdDev_8u_C1R_Ctx(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h)); + else + nppSafeCall(nppiMean_StdDev_32f_C1R_Ctx(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h)); +#else if(gsrc.type() == CV_8UC1) nppSafeCall( nppiMean_StdDev_8u_C1R(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) ); else nppSafeCall( nppiMean_StdDev_32f_C1R(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) ); +#endif syncOutput(gdst, dst, stream); } @@ -233,31 +245,49 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre sz.width = gsrc.cols; sz.height = gsrc.rows; -#if (CUDA_VERSION >= 12040) +#if (NPP_VERSION >= 12205) size_t bufSize; #else int bufSize; #endif + NppStreamHandler h(StreamAccessor::getStream(stream)); + #if (CUDA_VERSION <= 4020) nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) ); +#else +#if USE_NPP_STREAM_CTX + if (gsrc.type() == CV_8UC1) + nppSafeCall(nppiMeanStdDevGetBufferHostSize_8u_C1MR_Ctx(sz, &bufSize, h)); + else + nppSafeCall(nppiMeanStdDevGetBufferHostSize_32f_C1MR_Ctx(sz, &bufSize, h)); #else if (gsrc.type() == CV_8UC1) nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR(sz, &bufSize) ); else nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR(sz, &bufSize) ); +#endif #endif BufferPool pool(stream); CV_Assert(bufSize <= std::numeric_limits::max()); GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type()); +#if USE_NPP_STREAM_CTX + if (gsrc.type() == CV_8UC1) + nppSafeCall(nppiMean_StdDev_8u_C1MR_Ctx(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step), + sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h)); + else + nppSafeCall(nppiMean_StdDev_32f_C1MR_Ctx(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step), + sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h)); +#else if(gsrc.type() == CV_8UC1) nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) ); else nppSafeCall( nppiMean_StdDev_32f_C1MR(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) ); +#endif syncOutput(gdst, dst, stream); } @@ -288,8 +318,13 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re NppStreamHandler h(stream); +#if USE_NPP_STREAM_CTX + nppSafeCall(nppiRectStdDev_32s32f_C1R_Ctx(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step), + dst.ptr(), static_cast(dst.step), sz, nppRect, h)); +#else nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step), dst.ptr(), static_cast(dst.step), sz, nppRect) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index 6d71e544fa0..5bf9aac91ed 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ -68,26 +68,23 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he outFrame.create(height, width, CV_8UC3); Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] }; NppiSize oSizeROI = { width,height }; -#if (CUDART_VERSION < 10010) cv::cuda::NppStreamHandler h(stream); +#if USE_NPP_STREAM_CTX if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - else { - nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - } -#elif (CUDART_VERSION >= 10010) - NppStreamContext nppStreamCtx; - nppSafeCall(nppGetStreamContext(&nppStreamCtx)); - nppStreamCtx.hStream = StreamAccessor::getStream(stream); - if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx)); + nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); else { #if (CUDART_VERSION < 11000) - nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx)); + nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); #else - nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx)); + nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); #endif } +#else + if (videoFullRangeFlag) + nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); + else { + nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); + } #endif } else if (colorFormat == ColorFormat::GRAY) { diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index 185751f861f..ff2315ee70d 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -105,10 +105,17 @@ namespace void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); private: +#if USE_NPP_STREAM_CTX + typedef NppStatus(*nppFilterBox8U_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, + NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext ctx); + typedef NppStatus(*nppFilterBox32F_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, + NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext ctx); +#else typedef NppStatus (*nppFilterBox8U_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor); typedef NppStatus (*nppFilterBox32F_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor); +#endif Size ksize_; Point anchor_; @@ -161,20 +168,40 @@ namespace { case CV_8U: { +#if USE_NPP_STREAM_CTX + static const nppFilterBox8U_t funcs8U[] = { 0, nppiFilterBox_8u_C1R_Ctx, 0, 0, nppiFilterBox_8u_C4R_Ctx }; +#else static const nppFilterBox8U_t funcs8U[] = { 0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R }; +#endif const nppFilterBox8U_t func8U = funcs8U[cn]; +#if USE_NPP_STREAM_CTX + nppSafeCall(func8U(srcRoi.ptr(), static_cast(srcRoi.step), + dst.ptr(), static_cast(dst.step), + oSizeROI, oMaskSize, oAnchor, h)); +#else nppSafeCall(func8U(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), oSizeROI, oMaskSize, oAnchor)); +#endif } break; case CV_32F: { +#if USE_NPP_STREAM_CTX + static const nppFilterBox32F_t funcs32F[] = { 0, nppiFilterBox_32f_C1R_Ctx, 0, 0, 0 }; +#else static const nppFilterBox32F_t funcs32F[] = { 0, nppiFilterBox_32f_C1R, 0, 0, 0 }; +#endif const nppFilterBox32F_t func32F = funcs32F[cn]; +#if USE_NPP_STREAM_CTX + nppSafeCall(func32F(srcRoi.ptr(), static_cast(srcRoi.step), + dst.ptr(), static_cast(dst.step), + oSizeROI, oMaskSize, oAnchor, h)); +#else nppSafeCall(func32F(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), oSizeROI, oMaskSize, oAnchor)); +#endif } break; } @@ -566,10 +593,18 @@ namespace void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); private: +#if USE_NPP_STREAM_CTX typedef NppStatus (*nppMorfFilter8u_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI, - const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor); + const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext streamCtx); typedef NppStatus (*nppMorfFilter32f_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, NppiSize oSizeROI, - const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor); + const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext streamCtx); +#else + typedef NppStatus(*nppMorfFilter8u_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI, + const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor); + typedef NppStatus(*nppMorfFilter32f_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, NppiSize oSizeROI, + const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor); + +#endif int type_; GpuMat kernel_; @@ -585,6 +620,18 @@ namespace MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) : type_(srcType), anchor_(anchor), iters_(iterations) { +#if USE_NPP_STREAM_CTX + static const nppMorfFilter8u_t funcs8u[2][5] = + { + {0, nppiErode_8u_C1R_Ctx, 0, 0, nppiErode_8u_C4R_Ctx }, + {0, nppiDilate_8u_C1R_Ctx, 0, 0, nppiDilate_8u_C4R_Ctx } + }; + static const nppMorfFilter32f_t funcs32f[2][5] = + { + {0, nppiErode_32f_C1R_Ctx, 0, 0, nppiErode_32f_C4R_Ctx }, + {0, nppiDilate_32f_C1R_Ctx, 0, 0, nppiDilate_32f_C4R_Ctx } + }; +#else static const nppMorfFilter8u_t funcs8u[2][5] = { {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R }, @@ -595,6 +642,7 @@ namespace {0, nppiErode_32f_C1R, 0, 0, nppiErode_32f_C4R }, {0, nppiDilate_32f_C1R, 0, 0, nppiDilate_32f_C4R } }; +#endif CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC4 ); @@ -676,28 +724,46 @@ namespace if (type_ == CV_8UC1 || type_ == CV_8UC4) { +#if USE_NPP_STREAM_CTX nppSafeCall( func8u_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), - oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) ); + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) ); +#else + nppSafeCall(func8u_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor)); +#endif for(int i = 1; i < iters_; ++i) { dst.copyTo(bufRoi, _stream); - +#if USE_NPP_STREAM_CTX nppSafeCall( func8u_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step), - oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) ); + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) ); +#else + nppSafeCall(func8u_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step), + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor)); +#endif } } else if (type_ == CV_32FC1 || type_ == CV_32FC4) { +#if USE_NPP_STREAM_CTX nppSafeCall( func32f_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), - oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) ); - + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) ); +#else + nppSafeCall(func32f_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor)); +#endif for(int i = 1; i < iters_; ++i) { dst.copyTo(bufRoi, _stream); +#if USE_NPP_STREAM_CTX nppSafeCall( func32f_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step), - oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) ); + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) ); +#else + nppSafeCall(func32f_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), + oSizeROI, kernel_.ptr(), oMaskSize, oAnchor)); +#endif } } @@ -887,8 +953,13 @@ namespace void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); private: +#if USE_NPP_STREAM_CTX typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI, - NppiSize oMaskSize, NppiPoint oAnchor); + NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext); +#else + typedef NppStatus(*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI, + NppiSize oMaskSize, NppiPoint oAnchor); +#endif int type_; Size ksize_; @@ -903,8 +974,13 @@ namespace NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) : type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal) { - static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R}; +#if USE_NPP_STREAM_CTX + static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R_Ctx, 0, 0, nppiFilterMax_8u_C4R_Ctx}; + static const nppFilterRank_t minFuncs[] = { 0, nppiFilterMin_8u_C1R_Ctx, 0, 0, nppiFilterMin_8u_C4R_Ctx }; +#else + static const nppFilterRank_t maxFuncs[] = { 0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R }; static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R}; +#endif CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 ); @@ -943,8 +1019,13 @@ namespace oAnchor.x = anchor_.x; oAnchor.y = anchor_.y; +#if USE_NPP_STREAM_CTX + nppSafeCall(func_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), + oSizeROI, oMaskSize, oAnchor, h)); +#else nppSafeCall( func_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), oSizeROI, oMaskSize, oAnchor) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -1011,9 +1092,15 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; +#if USE_NPP_STREAM_CTX + nppSafeCall(nppiSumWindowRow_8u32f_C1R_Ctx(srcRoi.ptr(), static_cast(srcRoi.step), + dst.ptr(), static_cast(dst.step), + oSizeROI, ksize_, anchor_, h)); +#else nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), oSizeROI, ksize_, anchor_) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -1072,9 +1159,15 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; - nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step), +#if USE_NPP_STREAM_CTX + nppSafeCall( nppiSumWindowColumn_8u32f_C1R_Ctx(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step), - oSizeROI, ksize_, anchor_) ); + oSizeROI, ksize_, anchor_, h) ); +#else + nppSafeCall(nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step), + dst.ptr(), static_cast(dst.step), + oSizeROI, ksize_, anchor_) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/cudaimgproc/src/color.cpp b/modules/cudaimgproc/src/color.cpp index 5adfa6cda6c..7ef0835d05a 100644 --- a/modules/cudaimgproc/src/color.cpp +++ b/modules/cudaimgproc/src/color.cpp @@ -1831,10 +1831,17 @@ namespace oSizeROI.width = src.cols; oSizeROI.height = src.rows; +#if USE_NPP_STREAM_CTX if (src.depth() == CV_8U) - nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + nppSafeCall( nppiAlphaPremul_8u_AC4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, h) ); else - nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) ); + nppSafeCall( nppiAlphaPremul_16u_AC4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, h) ); +#else + if (src.depth() == CV_8U) + nppSafeCall(nppiAlphaPremul_8u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI)); + else + nppSafeCall(nppiAlphaPremul_16u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI)); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -2200,7 +2207,11 @@ void cv::cuda::swapChannels(InputOutputArray _image, const int dstOrder[4], Stre sz.width = image.cols; sz.height = image.rows; - nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr(), static_cast(image.step), sz, dstOrder) ); +#if USE_NPP_STREAM_CTX + nppSafeCall( nppiSwapChannels_8u_C4IR_Ctx(image.ptr(), static_cast(image.step), sz, dstOrder, h) ); +#else + nppSafeCall(nppiSwapChannels_8u_C4IR(image.ptr(), static_cast(image.step), sz, dstOrder)); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -2262,7 +2273,11 @@ namespace { typedef typename NPPTypeTraits::npp_type npp_t; - typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp); +#if USE_NPP_STREAM_CTX + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp, NppStreamContext ctx); +#else + typedef NppStatus(*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp); +#endif }; template ::func_t func> struct NppAlphaComp @@ -2277,8 +2292,13 @@ namespace oSizeROI.width = img1.cols; oSizeROI.height = img2.rows; +#if USE_NPP_STREAM_CTX nppSafeCall( func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step), - dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp) ); + dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp, h) ); +#else + nppSafeCall(func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step), + dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp)); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -2307,12 +2327,21 @@ void cv::cuda::alphaComp(InputArray _img1, InputArray _img2, OutputArray _dst, i typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream); static const func_t funcs[] = { +#if USE_NPP_STREAM_CTX + NppAlphaComp::call, + 0, + NppAlphaComp::call, + 0, + NppAlphaComp::call, + NppAlphaComp::call +#else NppAlphaComp::call, 0, NppAlphaComp::call, 0, NppAlphaComp::call, NppAlphaComp::call +#endif }; GpuMat img1 = _img1.getGpuMat(); diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index 51a5ce1a83e..023cb089c47 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -281,27 +281,37 @@ cv::Ptr cv::cuda::createCLAHE(double clipLimit, cv::Size tileGr namespace { -#if (CUDA_VERSION >= 12040) - typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize); - typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize); +#if (NPP_VERSION >= 12205) + typedef NppStatus(*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize, NppStreamContext ctx); + typedef NppStatus(*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize, NppStreamContext ctx); #else - typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize); - typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize); + typedef NppStatus(*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize); + typedef NppStatus(*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize); #endif template struct NppHistogramEvenFuncC1 { typedef typename NPPTypeTraits::npp_type src_t; - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist, - int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer); +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist, + int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u* pBuffer, NppStreamContext ctx); +#else + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist, + int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer); +#endif }; template struct NppHistogramEvenFuncC4 { typedef typename NPPTypeTraits::npp_type src_t; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, + Npp32s* pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u* pBuffer, NppStreamContext ctx); +#else typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer); +#endif }; template::func_ptr func, get_buf_size_c1_t get_buf_size> @@ -320,20 +330,27 @@ namespace sz.width = src.cols; sz.height = src.rows; -#if (CUDA_VERSION >= 12040) + NppStreamHandler h(stream); + +#if (NPP_VERSION >= 12205) size_t buf_size; + get_buf_size(sz, levels, &buf_size, h); #else int buf_size; -#endif get_buf_size(sz, levels, &buf_size); +#endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); - - NppStreamHandler h(stream); + CV_Assert(buf_size <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1); +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels, + lowerLevel, upperLevel, buf.ptr(), h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels, lowerLevel, upperLevel, buf.ptr()) ); +#endif if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); @@ -358,19 +375,25 @@ namespace Npp32s* pHist[] = {hist[0].ptr(), hist[1].ptr(), hist[2].ptr(), hist[3].ptr()}; -#if (CUDA_VERSION >= 12040) + NppStreamHandler h(stream); + +#if (NPP_VERSION >= 12205) size_t buf_size; + get_buf_size(sz, levels, &buf_size, h); #else int buf_size; -#endif get_buf_size(sz, levels, &buf_size); +#endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); - - NppStreamHandler h(stream); + CV_Assert(buf_size <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1); - nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr()) ); +#if USE_NPP_STREAM_CTX + nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr(), h)); +#else + nppSafeCall(func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr())); +#endif if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); @@ -383,8 +406,13 @@ namespace typedef Npp32s level_t; enum {LEVEL_TYPE_CODE=CV_32SC1}; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist, + const Npp32s* pLevels, int nLevels, Npp8u* pBuffer, NppStreamContext ctx); +#else typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist, const Npp32s* pLevels, int nLevels, Npp8u* pBuffer); +#endif }; template<> struct NppHistogramRangeFuncC1 { @@ -392,8 +420,13 @@ namespace typedef Npp32f level_t; enum {LEVEL_TYPE_CODE=CV_32FC1}; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist, + const Npp32f* pLevels, int nLevels, Npp8u* pBuffer, NppStreamContext ctx); +#else typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist, const Npp32f* pLevels, int nLevels, Npp8u* pBuffer); +#endif }; template struct NppHistogramRangeFuncC4 { @@ -401,8 +434,13 @@ namespace typedef Npp32s level_t; enum {LEVEL_TYPE_CODE=CV_32SC1}; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4], + const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer, NppStreamContext ctx); +#else typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4], const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer); +#endif }; template<> struct NppHistogramRangeFuncC4 { @@ -410,8 +448,13 @@ namespace typedef Npp32f level_t; enum {LEVEL_TYPE_CODE=CV_32FC1}; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4], + const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer, NppStreamContext ctx); +#else typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4], const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer); +#endif }; template::func_ptr func, get_buf_size_c1_t get_buf_size> @@ -432,19 +475,25 @@ namespace sz.width = src.cols; sz.height = src.rows; -#if (CUDA_VERSION >= 12040) + NppStreamHandler h(stream); + +#if (NPP_VERSION >= 12205) size_t buf_size; + get_buf_size(sz, levels.cols, &buf_size, h); #else int buf_size; -#endif get_buf_size(sz, levels.cols, &buf_size); +#endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); - - NppStreamHandler h(stream); + CV_Assert(buf_size <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1); +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels.ptr(), levels.cols, buf.ptr(), h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels.ptr(), levels.cols, buf.ptr()) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -477,19 +526,25 @@ namespace sz.width = src.cols; sz.height = src.rows; -#if (CUDA_VERSION >= 12040) + NppStreamHandler h(stream); + +#if (NPP_VERSION >= 12205) size_t buf_size; + get_buf_size(sz, nLevels, &buf_size, h); #else int buf_size; -#endif get_buf_size(sz, nLevels, &buf_size); +#endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); - - NppStreamHandler h(stream); + CV_Assert(buf_size <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1); +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), static_cast(src.step), sz, pHist, pLevels, nLevels, buf.ptr(), h)); +#else nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, pLevels, nLevels, buf.ptr()) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -497,6 +552,24 @@ namespace }; } +class OldNppStreamHandlerForEvenLevels +{ +public: + explicit OldNppStreamHandlerForEvenLevels(Stream& newStream) + { + oldStream = nppGetStream(); + nppSafeSetStream(oldStream, StreamAccessor::getStream(newStream)); + } + + ~OldNppStreamHandlerForEvenLevels() + { + nppSafeSetStream(nppGetStream(), oldStream); + } + +private: + cudaStream_t oldStream; +}; + void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream) { const int kind = _levels.kind(); @@ -509,6 +582,9 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int else host_levels = _levels.getMat(); + // Update to use NppStreamHandler when nppiEvenLevelsHost_32s_Ctx is included in nppist.lib and libnppist.so + OldNppStreamHandlerForEvenLevels h(stream); + nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr(), nLevels, lowerLevel, upperLevel) ); if (kind == _InputArray::CUDA_GPU_MAT) @@ -537,10 +613,17 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int low typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream); static const hist_t hist_callers[] = { +#if USE_NPP_STREAM_CTX + NppHistogramEvenC1::hist, + 0, + NppHistogramEvenC1::hist, + NppHistogramEvenC1::hist +#else NppHistogramEvenC1::hist, 0, NppHistogramEvenC1::hist, NppHistogramEvenC1::hist +#endif }; GpuMat src = _src.getGpuMat(); @@ -561,10 +644,17 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lo typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream); static const hist_t hist_callers[] = { +#if USE_NPP_STREAM_CTX + NppHistogramEvenC4::hist, + 0, + NppHistogramEvenC4::hist, + NppHistogramEvenC4::hist +#else NppHistogramEvenC4::hist, 0, NppHistogramEvenC4::hist, NppHistogramEvenC4::hist +#endif }; GpuMat src = _src.getGpuMat(); @@ -579,12 +669,21 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream); static const hist_t hist_callers[] = { +#if USE_NPP_STREAM_CTX + NppHistogramRangeC1::hist, + 0, + NppHistogramRangeC1::hist, + NppHistogramRangeC1::hist, + 0, + NppHistogramRangeC1::hist +#else NppHistogramRangeC1::hist, 0, NppHistogramRangeC1::hist, NppHistogramRangeC1::hist, 0, NppHistogramRangeC1::hist +#endif }; GpuMat src = _src.getGpuMat(); @@ -600,12 +699,21 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4] typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream); static const hist_t hist_callers[] = { +#if USE_NPP_STREAM_CTX + NppHistogramRangeC4::hist, + 0, + NppHistogramRangeC4::hist, + NppHistogramRangeC4::hist, + 0, + NppHistogramRangeC4::hist +#else NppHistogramRangeC4::hist, 0, NppHistogramRangeC4::hist, NppHistogramRangeC4::hist, 0, NppHistogramRangeC4::hist +#endif }; GpuMat src = _src.getGpuMat(); diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp index a18a459c2b1..8690f54085d 100644 --- a/modules/cudawarping/src/warp.cpp +++ b/modules/cudawarping/src/warp.cpp @@ -139,9 +139,15 @@ namespace { typedef typename NPPTypeTraits::npp_type npp_type; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst, + int dstStep, NppiRect dstRoi, const double coeffs[][3], + int interpolation, NppStreamContext ctx); +#else typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst, int dstStep, NppiRect dstRoi, const double coeffs[][3], int interpolation); +#endif }; template ::func_t func> struct NppWarp @@ -170,9 +176,15 @@ namespace cv::cuda::NppStreamHandler h(stream); +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), srcsz, static_cast(src.step), srcroi, + dst.ptr(), static_cast(dst.step), dstroi, + coeffs, npp_inter[interpolation], h)); +#else nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, dst.ptr(), static_cast(dst.step), dstroi, coeffs, npp_inter[interpolation]) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -250,6 +262,24 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size static const func_t funcs[2][6][4] = { +#if USE_NPP_STREAM_CTX + { + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} + }, + { + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} + } +#else { {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, @@ -266,6 +296,7 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {NppWarp::call, 0, NppWarp::call, NppWarp::call} } +#endif }; dst.setTo(borderValue, stream); @@ -389,6 +420,24 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, static const func_t funcs[2][6][4] = { +#if USE_NPP_STREAM_CTX + { + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} + }, + { + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {0, 0, 0, 0}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call}, + {NppWarp::call, 0, NppWarp::call, NppWarp::call} + } +#else { {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {0, 0, 0, 0}, @@ -405,6 +454,7 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, {NppWarp::call, 0, NppWarp::call, NppWarp::call}, {NppWarp::call, 0, NppWarp::call, NppWarp::call} } +#endif }; dst.setTo(borderValue, stream); @@ -467,9 +517,15 @@ namespace { typedef typename NPPTypeTraits::npp_type npp_type; +#if USE_NPP_STREAM_CTX + typedef NppStatus(*func_t)(const npp_type* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, + npp_type* pDst, int nDstStep, NppiRect oDstROI, + double nAngle, double nShiftX, double nShiftY, int eInterpolation, NppStreamContext ctx); +#else typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, npp_type* pDst, int nDstStep, NppiRect oDstROI, double nAngle, double nShiftX, double nShiftY, int eInterpolation); +#endif }; template ::func_t func> struct NppRotate @@ -495,8 +551,13 @@ namespace dstroi.height = dst.rows; dstroi.width = dst.cols; +#if USE_NPP_STREAM_CTX + nppSafeCall(func(src.ptr(), srcsz, static_cast(src.step), srcroi, + dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation], h)); +#else nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi, dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); @@ -509,12 +570,21 @@ void cv::cuda::rotate(InputArray _src, OutputArray _dst, Size dsize, double angl typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream); static const func_t funcs[6][4] = { +#if USE_NPP_STREAM_CTX + {NppRotate::call, 0, NppRotate::call, NppRotate::call}, + {0,0,0,0}, + {NppRotate::call, 0, NppRotate::call, NppRotate::call}, + {0,0,0,0}, + {0,0,0,0}, + {NppRotate::call, 0, NppRotate::call, NppRotate::call} +#else {NppRotate::call, 0, NppRotate::call, NppRotate::call}, {0,0,0,0}, {NppRotate::call, 0, NppRotate::call, NppRotate::call}, {0,0,0,0}, {0,0,0,0}, {NppRotate::call, 0, NppRotate::call, NppRotate::call} +#endif }; GpuMat src = _src.getGpuMat();