From 06f764abc1539d2a33a9b237bff655adbf126e53 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Wed, 2 Oct 2024 13:52:44 +0300
Subject: [PATCH 1/2] cuda - update npp calls to use the new NppStreamContext
 API if available

---
 modules/cudaarithm/src/core.cpp               |  49 +++++-
 modules/cudaarithm/src/cuda/bitwise_scalar.cu |  47 ++++-
 modules/cudaarithm/src/cuda/threshold.cu      |   5 +
 modules/cudaarithm/src/cuda/transpose.cu      |   5 +
 modules/cudaarithm/src/element_operations.cpp |  49 ++++++
 modules/cudaarithm/src/reductions.cpp         |  47 ++++-
 modules/cudacodec/src/video_reader.cpp        |  23 ++-
 modules/cudafilters/src/filtering.cpp         | 117 +++++++++++--
 modules/cudaimgproc/src/color.cpp             |  77 ++++++--
 modules/cudaimgproc/src/histogram.cpp         | 164 +++++++++++++++---
 modules/cudawarping/src/warp.cpp              |  70 ++++++++
 11 files changed, 571 insertions(+), 82 deletions(-)

diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp
index 223929881fd..449cb308a2d 100644
--- a/modules/cudaarithm/src/core.cpp
+++ b/modules/cudaarithm/src/core.cpp
@@ -79,7 +79,11 @@ namespace
 {
     typedef typename NppTypeTraits::npp_t npp_t;

-    typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
+#else
+    typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
+#endif
 };

 template ::func_t func> struct NppMirror
@@ -94,9 +98,15 @@ namespace
         sz.width  = src.cols;
         sz.height = src.rows;

+#if USE_NPP_STREAM_CTX
         nppSafeCall( func(src.ptr(), static_cast(src.step),
             dst.ptr(), static_cast(dst.step), sz,
-            (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
+            (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
+#else
+        nppSafeCall( func(src.ptr(), static_cast(src.step),
+            dst.ptr(), static_cast(dst.step), sz,
+            (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -107,7 +117,11 @@ namespace
 {
     typedef typename NppTypeTraits::npp_t npp_t;

-    typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip, NppStreamContext ctx);
+#else
+    typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
+#endif
 };

 template ::func_t func> struct NppMirrorI
@@ -121,10 +135,15 @@ namespace
         NppiSize sz;
         sz.width  = srcDst.cols;
         sz.height = srcDst.rows;
-
+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(srcDst.ptr(), static_cast(srcDst.step),
+            sz,
+            (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS)), h) );
+#else
         nppSafeCall( func(srcDst.ptr(), static_cast(srcDst.step),
             sz,
             (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -137,23 +156,41 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
     static const func_t funcs[6][4] =
     {
-        {NppMirror::call, 0, NppMirror::call, NppMirror::call},
+#if USE_NPP_STREAM_CTX
+        {NppMirror::call, 0, NppMirror::call, NppMirror::call},
+        {0,0,0,0},
+        {NppMirror::call, 0, NppMirror::call, NppMirror::call},
+        {0,0,0,0},
+        {NppMirror::call, 0, NppMirror::call, NppMirror::call},
+        {NppMirror::call, 0, NppMirror::call, NppMirror::call}
+#else
+        {NppMirror::call, 0, NppMirror::call, NppMirror::call},
         {0,0,0,0},
         {NppMirror::call, 0, NppMirror::call, NppMirror::call},
         {0,0,0,0},
         {NppMirror::call, 0, NppMirror::call, NppMirror::call},
         {NppMirror::call, 0, NppMirror::call, NppMirror::call}
+#endif
     };

     typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream);
     static const ifunc_t ifuncs[6][4] =
     {
-        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
+#if USE_NPP_STREAM_CTX
+        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
+        {0,0,0,0},
+        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
+        {0,0,0,0},
+        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
+        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}
+#else
+        {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
         {0,0,0,0},
         {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
         {0,0,0,0},
         {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call},
         {NppMirrorI::call, 0, NppMirrorI::call, NppMirrorI::call}
+#endif
     };

     GpuMat src = getInputMat(_src, stream);
diff --git a/modules/cudaarithm/src/cuda/bitwise_scalar.cu b/modules/cudaarithm/src/cuda/bitwise_scalar.cu
index 0dd99e8cd85..1ff3edd7b48 100644
--- a/modules/cudaarithm/src/cuda/bitwise_scalar.cu
+++ b/modules/cudaarithm/src/cuda/bitwise_scalar.cu
@@ -92,7 +92,11 @@ namespace
 {
     typedef typename NPPTypeTraits::npp_type npp_type;

+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+#else
     typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+#endif
 };

 template ::func_t func> struct NppBitwiseC
@@ -116,7 +120,11 @@ namespace
             cv::saturate_cast(value[3])
         };

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI, h) );
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) );
+#endif

         if (stream == 0)
             CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
@@ -131,13 +139,39 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
     typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
     static const func_t funcs[3][6][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}
+        },
         {
-            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
-            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
-            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
-            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
-            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call},
-            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}
+        },
+        {
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call},
+            {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}
+        }
+#else
+        {
+            { BitScalar >::call, 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call },
+            { BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call },
+            { BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call },
+            { BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call },
+            { BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call },
+            { BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call }
         },
         {
             {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOp >::call},
@@ -155,6 +189,7 @@ void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const Gpu
             {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call},
             {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call}
         }
+#endif
     };

     const int depth = src.depth();
diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 1249fee04be..cfa88dfae7c 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -116,8 +116,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
         sz.width  = src.cols;
         sz.height = src.rows;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( nppiThreshold_32f_C1R_Ctx(src.ptr(), static_cast(src.step),
+            dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER, h) );
+#else
         nppSafeCall( nppiThreshold_32f_C1R(src.ptr(), static_cast(src.step),
             dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER) );
+#endif

         if (!stream)
             CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu
index bfe50bd34fb..8dae081731e 100644
--- a/modules/cudaarithm/src/cuda/transpose.cu
+++ b/modules/cudaarithm/src/cuda/transpose.cu
@@ -74,8 +74,13 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
         sz.width  = src.cols;
         sz.height = src.rows;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step),
+            dst.ptr(), static_cast(dst.step), sz, h) );
+#else
         nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step),
             dst.ptr(), static_cast(dst.step), sz) );
+#endif

         if (!stream)
             CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp
index 1ad3c17c40f..037abc07821 100644
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -337,13 +337,21 @@ namespace
 {
     typedef typename NPPTypeTraits::npp_type npp_type;

+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+#else
     typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+#endif
 };

 template struct NppShiftFunc
 {
     typedef typename NPPTypeTraits::npp_type npp_type;

+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+#else
     typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+#endif
 };

 template ::func_t func> struct NppShift
@@ -358,7 +366,11 @@ namespace
         oSizeROI.width  = src.cols;
         oSizeROI.height = src.rows;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI, h) );
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -376,7 +388,11 @@ namespace
         oSizeROI.width  = src.cols;
         oSizeROI.height = src.rows;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI, h) );
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -389,11 +405,20 @@ void cv::cuda::rshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea
     typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+#else
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {NppShift::call, 0, NppShift::call, NppShift::call},
+#endif
     };

     GpuMat src = getInputMat(_src, stream);
@@ -413,11 +438,19 @@ void cv::cuda::lshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea
     typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {0, 0, 0, 0},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+        {0, 0, 0, 0},
+        {NppShift::call, 0, NppShift::call, NppShift::call},
+#else
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {0, 0, 0, 0},
         {NppShift::call, 0, NppShift::call, NppShift::call},
         {0, 0, 0, 0},
         {NppShift::call, 0, NppShift::call, NppShift::call},
+#endif
     };

     GpuMat src = getInputMat(_src, stream);
@@ -463,7 +496,11 @@ void cv::cuda::max(InputArray src1, InputArray src2, OutputArray dst, Stream& st

 namespace
 {
+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+#else
     typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
+#endif

     void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
     {
@@ -475,7 +512,11 @@ namespace
         NppStreamHandler h(stream);

+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, h) );
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -488,7 +529,11 @@ void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream)

     GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);

+#if USE_NPP_STREAM_CTX
+    npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream));
+#else
     npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
+#endif

     syncOutput(dst, _dst, stream);
 }
@@ -499,7 +544,11 @@ void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream)

     GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);

+#if USE_NPP_STREAM_CTX
+    npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R_Ctx, StreamAccessor::getStream(stream));
+#else
     npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
+#endif

     syncOutput(dst, _dst, stream);
 }
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index b70a128558f..d89a2984303 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -151,32 +151,44 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
     sz.width  = gsrc.cols;
     sz.height = gsrc.rows;

-#if (CUDA_VERSION >= 12040)
+#if (NPP_VERSION >= 12205)
     size_t bufSize;
 #else
     int bufSize;
 #endif

+    NppStreamHandler h(StreamAccessor::getStream(stream));
+
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
+#else
+#if USE_NPP_STREAM_CTX
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R_Ctx(sz, &bufSize, h) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R_Ctx(sz, &bufSize, h) );
 #else
     if (gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
     else
         nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1R(sz, &bufSize) );
+#endif
 #endif

     BufferPool pool(stream);
     CV_Assert(bufSize <= std::numeric_limits::max());
     GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type());
-
-    // detail: https://github.com/opencv/opencv/issues/11063
-    //NppStreamHandler h(StreamAccessor::getStream(stream));
-
+#if USE_NPP_STREAM_CTX
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1R_Ctx(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1R_Ctx(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h) );
+#else
     if (gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMean_StdDev_8u_C1R(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) );
     else
         nppSafeCall( nppiMean_StdDev_32f_C1R(gsrc.ptr(), static_cast(gsrc.step), sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) );
+#endif

     syncOutput(gdst, dst, stream);
 }
@@ -233,31 +245,49 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
     sz.width  = gsrc.cols;
     sz.height = gsrc.rows;

-#if (CUDA_VERSION >= 12040)
+#if (NPP_VERSION >= 12205)
     size_t bufSize;
 #else
     int bufSize;
 #endif

+    NppStreamHandler h(StreamAccessor::getStream(stream));
+
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
+#else
+#if USE_NPP_STREAM_CTX
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR_Ctx(sz, &bufSize, h) );
+    else
+        nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR_Ctx(sz, &bufSize, h) );
 #else
     if (gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1MR(sz, &bufSize) );
     else
         nppSafeCall( nppiMeanStdDevGetBufferHostSize_32f_C1MR(sz, &bufSize) );
+#endif
 #endif

     BufferPool pool(stream);
     CV_Assert(bufSize <= std::numeric_limits::max());
     GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type());

+#if USE_NPP_STREAM_CTX
+    if (gsrc.type() == CV_8UC1)
+        nppSafeCall( nppiMean_StdDev_8u_C1MR_Ctx(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step),
+            sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h) );
+    else
+        nppSafeCall( nppiMean_StdDev_32f_C1MR_Ctx(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step),
+            sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1, h) );
+#else
     if (gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step),
             sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) );
     else
         nppSafeCall( nppiMean_StdDev_32f_C1MR(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step),
             sz, buf.ptr(), gdst.ptr(), gdst.ptr() + 1) );
+#endif

     syncOutput(gdst, dst, stream);
 }
@@ -288,8 +318,13 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re
     NppStreamHandler h(stream);

+#if USE_NPP_STREAM_CTX
+    nppSafeCall( nppiRectStdDev_32s32f_C1R_Ctx(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step),
+        dst.ptr(), static_cast(dst.step), sz, nppRect, h) );
+#else
     nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step),
         dst.ptr(), static_cast(dst.step), sz, nppRect) );
+#endif

     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index 6d71e544fa0..5bf9aac91ed 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -68,26 +68,23 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
         outFrame.create(height, width, CV_8UC3);
         Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
         NppiSize oSizeROI = { width, height };
-#if (CUDART_VERSION < 10010)
         cv::cuda::NppStreamHandler h(stream);
+#if USE_NPP_STREAM_CTX
         if (videoFullRangeFlag)
-            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
-        else {
-            nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
-        }
-#elif (CUDART_VERSION >= 10010)
-        NppStreamContext nppStreamCtx;
-        nppSafeCall(nppGetStreamContext(&nppStreamCtx));
-        nppStreamCtx.hStream = StreamAccessor::getStream(stream);
-        if (videoFullRangeFlag)
-            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h));
         else {
 #if (CUDART_VERSION < 11000)
-            nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+            nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h));
 #else
-            nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
+            nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h));
 #endif
         }
+#else
+        if (videoFullRangeFlag)
+            nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
+        else {
+            nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
+        }
 #endif
     }
     else if (colorFormat == ColorFormat::GRAY) {
diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp
index 185751f861f..ff2315ee70d 100644
--- a/modules/cudafilters/src/filtering.cpp
+++ b/modules/cudafilters/src/filtering.cpp
@@ -105,10 +105,17 @@ namespace
         void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());

     private:
+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*nppFilterBox8U_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
+            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext ctx);
+        typedef NppStatus (*nppFilterBox32F_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep,
+            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext ctx);
+#else
         typedef NppStatus (*nppFilterBox8U_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
             NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
         typedef NppStatus (*nppFilterBox32F_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep,
             NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
+#endif

         Size ksize_;
         Point anchor_;
@@ -161,20 +168,40 @@ namespace
         {
         case CV_8U:
         {
+#if USE_NPP_STREAM_CTX
+            static const nppFilterBox8U_t funcs8U[] = { 0, nppiFilterBox_8u_C1R_Ctx, 0, 0, nppiFilterBox_8u_C4R_Ctx };
+#else
             static const nppFilterBox8U_t funcs8U[] = { 0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R };
+#endif
             const nppFilterBox8U_t func8U = funcs8U[cn];
+#if USE_NPP_STREAM_CTX
+            nppSafeCall(func8U(srcRoi.ptr(), static_cast(srcRoi.step),
+                dst.ptr(), static_cast(dst.step),
+                oSizeROI, oMaskSize, oAnchor, h));
+#else
             nppSafeCall(func8U(srcRoi.ptr(), static_cast(srcRoi.step),
                 dst.ptr(), static_cast(dst.step),
                 oSizeROI, oMaskSize, oAnchor));
+#endif
         }
         break;
         case CV_32F:
         {
+#if USE_NPP_STREAM_CTX
+            static const nppFilterBox32F_t funcs32F[] = { 0, nppiFilterBox_32f_C1R_Ctx, 0, 0, 0 };
+#else
             static const nppFilterBox32F_t funcs32F[] = { 0, nppiFilterBox_32f_C1R, 0, 0, 0 };
+#endif
             const nppFilterBox32F_t func32F = funcs32F[cn];
+#if USE_NPP_STREAM_CTX
+            nppSafeCall(func32F(srcRoi.ptr(), static_cast(srcRoi.step),
+                dst.ptr(), static_cast(dst.step),
+                oSizeROI, oMaskSize, oAnchor, h));
+#else
             nppSafeCall(func32F(srcRoi.ptr(), static_cast(srcRoi.step),
                 dst.ptr(), static_cast(dst.step),
                 oSizeROI, oMaskSize, oAnchor));
+#endif
         }
         break;
         }
@@ -566,10 +593,18 @@ namespace
         void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());

     private:
+#if USE_NPP_STREAM_CTX
         typedef NppStatus (*nppMorfFilter8u_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
-            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext streamCtx);
         typedef NppStatus (*nppMorfFilter32f_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, NppiSize oSizeROI,
-            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext streamCtx);
+#else
+        typedef NppStatus (*nppMorfFilter8u_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+        typedef NppStatus (*nppMorfFilter32f_t)(const Npp32f* pSrc, Npp32s nSrcStep, Npp32f* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+            const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+#endif

         int type_;
         GpuMat kernel_;
@@ -585,6 +620,18 @@ namespace
     MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) :
         type_(srcType), anchor_(anchor), iters_(iterations)
    {
+#if USE_NPP_STREAM_CTX
+        static const nppMorfFilter8u_t funcs8u[2][5] =
+        {
+            {0, nppiErode_8u_C1R_Ctx,  0, 0, nppiErode_8u_C4R_Ctx },
+            {0, nppiDilate_8u_C1R_Ctx, 0, 0, nppiDilate_8u_C4R_Ctx }
+        };
+        static const nppMorfFilter32f_t funcs32f[2][5] =
+        {
+            {0, nppiErode_32f_C1R_Ctx,  0, 0, nppiErode_32f_C4R_Ctx },
+            {0, nppiDilate_32f_C1R_Ctx, 0, 0, nppiDilate_32f_C4R_Ctx }
+        };
+#else
         static const nppMorfFilter8u_t funcs8u[2][5] =
         {
             {0, nppiErode_8u_C1R,  0, 0, nppiErode_8u_C4R },
@@ -595,6 +642,7 @@ namespace
             {0, nppiErode_32f_C1R,  0, 0, nppiErode_32f_C4R },
             {0, nppiDilate_32f_C1R, 0, 0, nppiDilate_32f_C4R }
         };
+#endif

         CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE );
         CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC4 );
@@ -676,28 +724,46 @@ namespace
         if (type_ == CV_8UC1 || type_ == CV_8UC4)
         {
+#if USE_NPP_STREAM_CTX
             nppSafeCall( func8u_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
-                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) );
+                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) );
+#else
+            nppSafeCall(func8u_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
+                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor));
+#endif

             for(int i = 1; i < iters_; ++i)
             {
                 dst.copyTo(bufRoi, _stream);
-
+#if USE_NPP_STREAM_CTX
                 nppSafeCall( func8u_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step),
-                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) );
+                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) );
+#else
+                nppSafeCall(func8u_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step),
+                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor));
+#endif
             }
         }
         else if (type_ == CV_32FC1 || type_ == CV_32FC4)
         {
+#if USE_NPP_STREAM_CTX
             nppSafeCall( func32f_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
-                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) );
-
+                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) );
+#else
+            nppSafeCall(func32f_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
+                oSizeROI, kernel_.ptr(), oMaskSize, oAnchor));
+#endif
             for(int i = 1; i < iters_; ++i)
             {
                 dst.copyTo(bufRoi, _stream);
+#if USE_NPP_STREAM_CTX
                 nppSafeCall( func32f_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step),
-                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor) );
+                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor, h) );
+#else
+                nppSafeCall(func32f_(bufRoi.ptr(), static_cast(bufRoi.step), dst.ptr(), static_cast(dst.step),
+                    oSizeROI, kernel_.ptr(), oMaskSize, oAnchor));
+#endif
             }
         }

@@ -887,8 +953,13 @@ namespace
         void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());

     private:
+#if USE_NPP_STREAM_CTX
         typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
-            NppiSize oMaskSize, NppiPoint oAnchor);
+            NppiSize oMaskSize, NppiPoint oAnchor, NppStreamContext);
+#else
+        typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+            NppiSize oMaskSize, NppiPoint oAnchor);
+#endif

         int type_;
         Size ksize_;
@@ -903,8 +974,13 @@ namespace
     NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
         type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
     {
-        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
+#if USE_NPP_STREAM_CTX
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R_Ctx, 0, 0, nppiFilterMax_8u_C4R_Ctx};
+        static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R_Ctx, 0, 0, nppiFilterMin_8u_C4R_Ctx};
+#else
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
         static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+#endif

         CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
@@ -943,8 +1019,13 @@ namespace
         oAnchor.x = anchor_.x;
         oAnchor.y = anchor_.y;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
+            oSizeROI, oMaskSize, oAnchor, h));
+#else
         nppSafeCall( func_(srcRoi.ptr(), static_cast(srcRoi.step), dst.ptr(), static_cast(dst.step),
             oSizeROI, oMaskSize, oAnchor) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -1011,9 +1092,15 @@ namespace
         oSizeROI.width  = src.cols;
         oSizeROI.height = src.rows;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(nppiSumWindowRow_8u32f_C1R_Ctx(srcRoi.ptr(), static_cast(srcRoi.step),
+            dst.ptr(), static_cast(dst.step),
+            oSizeROI, ksize_, anchor_, h));
+#else
         nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step),
             dst.ptr(), static_cast(dst.step),
             oSizeROI, ksize_, anchor_) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -1072,9 +1159,15 @@ namespace
         oSizeROI.width  = src.cols;
         oSizeROI.height = src.rows;

-        nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step),
+#if USE_NPP_STREAM_CTX
+        nppSafeCall( nppiSumWindowColumn_8u32f_C1R_Ctx(srcRoi.ptr(), static_cast(srcRoi.step),
             dst.ptr(), static_cast(dst.step),
-            oSizeROI, ksize_, anchor_) );
+            oSizeROI, ksize_, anchor_, h) );
+#else
+        nppSafeCall(nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr(), static_cast(srcRoi.step),
+            dst.ptr(), static_cast(dst.step),
+            oSizeROI, ksize_, anchor_) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/cudaimgproc/src/color.cpp b/modules/cudaimgproc/src/color.cpp
index 5adfa6cda6c..6e812428ee6 100644
--- a/modules/cudaimgproc/src/color.cpp
+++ b/modules/cudaimgproc/src/color.cpp
@@ -1831,10 +1831,17 @@ namespace
         oSizeROI.width  = src.cols;
         oSizeROI.height = src.rows;

+#if USE_NPP_STREAM_CTX
         if (src.depth() == CV_8U)
-            nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) );
+            nppSafeCall( nppiAlphaPremul_8u_AC4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, h) );
         else
-            nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI) );
+            nppSafeCall( nppiAlphaPremul_16u_AC4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, h) );
+#else
+        if (src.depth() == CV_8U)
+            nppSafeCall(nppiAlphaPremul_8u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI));
+        else
+            nppSafeCall(nppiAlphaPremul_16u_AC4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI));
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -2200,7 +2207,11 @@ void cv::cuda::swapChannels(InputOutputArray _image, const int dstOrder[4], Stre
     sz.width  = image.cols;
     sz.height = image.rows;

-    nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr(), static_cast(image.step), sz, dstOrder) );
+#if USE_NPP_STREAM_CTX
+    nppSafeCall( nppiSwapChannels_8u_C4IR_Ctx(image.ptr(), static_cast(image.step), sz, dstOrder, h) );
+#else
+    nppSafeCall(nppiSwapChannels_8u_C4IR(image.ptr(), static_cast(image.step), sz, dstOrder));
+#endif

     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -2209,7 +2220,7 @@ void cv::cuda::swapChannels(InputOutputArray _image, const int dstOrder[4], Stre
 ////////////////////////////////////////////////////////////////////////
 // gammaCorrection

-void cv::cuda::gammaCorrection(InputArray _src, OutputArray _dst, bool forward, Stream& stream)
+void cv::cuda::gammaCorrection(InputArray _src, OutputArray _dst, bool forward, Stream& _stream)
 {
 #if (CUDA_VERSION < 5000)
     CV_UNUSED(_src);
     CV_UNUSED(_dst);
-    CV_UNUSED(stream);
+    CV_UNUSED(_stream);
     CV_Error(Error::StsNotImplemented, "This function works only with CUDA 5.0 or higher");
 #else
@@ -2218,8 +2229,23 @@ void cv::cuda::gammaCorrection(InputArray _src, OutputArray _dst, bool forward,
-    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI);
-    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI, NppStreamContext ctx);
+
+    static const func_t funcs[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3R_Ctx, nppiGammaInv_8u_AC4R_Ctx},
+        {0, 0, 0, nppiGammaFwd_8u_C3R_Ctx, nppiGammaFwd_8u_AC4R_Ctx}
+    };
+    static const func_inplace_t funcs_inplace[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3IR_Ctx, nppiGammaInv_8u_AC4IR_Ctx},
+        {0, 0, 0, nppiGammaFwd_8u_C3IR_Ctx, nppiGammaFwd_8u_AC4IR_Ctx}
+    };
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI);
+    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI);

     static const func_t funcs[2][5] =
     {
@@ -2231,6 +2257,7 @@ void cv::cuda::gammaCorrection(InputArray _src, OutputArray _dst, bool forward,
         {0, 0, 0, nppiGammaInv_8u_C3IR, nppiGammaInv_8u_AC4IR},
         {0, 0, 0, nppiGammaFwd_8u_C3IR, nppiGammaFwd_8u_AC4IR}
     };
+#endif

     GpuMat src = _src.getGpuMat();
@@ -2239,17 +2266,27 @@ void cv::cuda::gammaCorrection(InputArray _src, OutputArray _dst, bool forward,
     _dst.create(src.size(), src.type());
     GpuMat dst = _dst.getGpuMat();

-    NppStreamHandler h(StreamAccessor::getStream(stream));
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    NppStreamHandler h(stream);

     NppiSize oSizeROI;
     oSizeROI.width  = src.cols;
     oSizeROI.height = src.rows;

+#if USE_NPP_STREAM_CTX
+    if (dst.data == src.data)
+        nppSafeCall(funcs_inplace[forward][src.channels()](dst.ptr(), static_cast(src.step), oSizeROI, h));
+    else
+        nppSafeCall(funcs[forward][src.channels()](src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI, h));
+#else
     if (dst.data == src.data)
-        funcs_inplace[forward][src.channels()](dst.ptr(), static_cast(src.step), oSizeROI);
+        nppSafeCall(funcs_inplace[forward][src.channels()](dst.ptr(), static_cast(src.step), oSizeROI));
     else
-        funcs[forward][src.channels()](src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI);
+        nppSafeCall(funcs[forward][src.channels()](src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), oSizeROI));
+#endif
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
 #endif
 }
@@ -2262,7 +2299,11 @@ namespace
 {
     typedef typename NPPTypeTraits::npp_type npp_t;

-    typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp, NppStreamContext ctx);
+#else
+    typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
+#endif
 };

 template ::func_t func> struct NppAlphaComp
@@ -2277,8 +2318,13 @@ namespace
         oSizeROI.width  = img1.cols;
         oSizeROI.height = img2.rows;

+#if USE_NPP_STREAM_CTX
         nppSafeCall( func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step),
-            dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp) );
+            dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp, h) );
+#else
+        nppSafeCall(func(img1.ptr(), static_cast(img1.step), img2.ptr(), static_cast(img2.step),
+            dst.ptr(), static_cast(dst.step), oSizeROI, eAlphaOp));
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -2307,12 +2353,21 @@ void cv::cuda::alphaComp(InputArray _img1, InputArray _img2, OutputArray _dst, i
     typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream);
     static const func_t funcs[] =
     {
+#if USE_NPP_STREAM_CTX
+        NppAlphaComp::call,
+        0,
+        NppAlphaComp::call,
+        0,
+        NppAlphaComp::call,
+        NppAlphaComp::call
+#else
         NppAlphaComp::call,
         0,
         NppAlphaComp::call,
         0,
         NppAlphaComp::call,
         NppAlphaComp::call
+#endif
     };

     GpuMat img1 = _img1.getGpuMat();
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index 51a5ce1a83e..023cb089c47 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -281,27 +281,37 @@ cv::Ptr cv::cuda::createCLAHE(double clipLimit, cv::Size tileGr

 namespace
 {
-#if (CUDA_VERSION >= 12040)
-    typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize);
-    typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize);
+#if (NPP_VERSION >= 12205)
+    typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize, NppStreamContext ctx);
+    typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize, NppStreamContext ctx);
 #else
     typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
     typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
 #endif

     template struct NppHistogramEvenFuncC1
     {
         typedef typename NPPTypeTraits::npp_type src_t;

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
+            int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
             int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u* pBuffer);
+#endif
     };
     template struct NppHistogramEvenFuncC4
     {
         typedef typename NPPTypeTraits::npp_type src_t;

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
+            Npp32s* pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
             Npp32s* pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u* pBuffer);
+#endif
     };

     template::func_ptr func, get_buf_size_c1_t get_buf_size>
@@ -320,20 +330,27 @@ namespace
         sz.width  = src.cols;
         sz.height = src.rows;

-#if (CUDA_VERSION >= 12040)
+        NppStreamHandler h(stream);
+
+#if (NPP_VERSION >= 12205)
         size_t buf_size;
+        get_buf_size(sz, levels, &buf_size, h);
 #else
         int buf_size;
-#endif
         get_buf_size(sz, levels, &buf_size);
+#endif

         BufferPool pool(stream);
-        GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
-
-        NppStreamHandler h(stream);
+        CV_Assert(buf_size <= std::numeric_limits::max());
+        GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1);

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels,
+            lowerLevel, upperLevel, buf.ptr(), h));
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels,
             lowerLevel, upperLevel, buf.ptr()) );
+#endif

         if (!stream)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -358,19 +375,25 @@ namespace
         Npp32s* pHist[] = {hist[0].ptr(), hist[1].ptr(), hist[2].ptr(), hist[3].ptr()};

-#if (CUDA_VERSION >= 12040)
+        NppStreamHandler h(stream);
+
+#if (NPP_VERSION >= 12205)
         size_t buf_size;
+        get_buf_size(sz, levels, &buf_size, h);
 #else
         int buf_size;
-#endif
         get_buf_size(sz, levels, &buf_size);
+#endif

         BufferPool pool(stream);
-        GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
-
-        NppStreamHandler h(stream);
+        CV_Assert(buf_size <= std::numeric_limits::max());
+        GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1);

-        nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr()) );
+#if USE_NPP_STREAM_CTX
+        nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr(), h));
+#else
+        nppSafeCall(func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr()));
+#endif

         if (!stream)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -383,8 +406,13 @@ namespace
         typedef Npp32s level_t;
         enum {LEVEL_TYPE_CODE=CV_32SC1};

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
+            const Npp32s* pLevels, int nLevels, Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
             const Npp32s* pLevels, int nLevels, Npp8u* pBuffer);
+#endif
     };
     template<> struct NppHistogramRangeFuncC1
     {
@@ -392,8 +420,13 @@ namespace
         typedef Npp32f level_t;
         enum {LEVEL_TYPE_CODE=CV_32FC1};

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
+            const Npp32f* pLevels, int nLevels, Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
             const Npp32f* pLevels, int nLevels, Npp8u* pBuffer);
+#endif
     };
     template struct NppHistogramRangeFuncC4
     {
@@ -401,8 +434,13 @@ namespace
         typedef Npp32s level_t;
         enum {LEVEL_TYPE_CODE=CV_32SC1};

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
+            const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
             const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer);
+#endif
     };
     template<> struct NppHistogramRangeFuncC4
     {
@@ -410,8 +448,13 @@ namespace
         typedef Npp32f level_t;
         enum {LEVEL_TYPE_CODE=CV_32FC1};

+#if USE_NPP_STREAM_CTX
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
+            const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer, NppStreamContext ctx);
+#else
         typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
             const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer);
+#endif
     };

     template::func_ptr func, get_buf_size_c1_t get_buf_size>
@@ -432,19 +475,25 @@ namespace
         sz.width  = src.cols;
         sz.height = src.rows;

-#if (CUDA_VERSION >= 12040)
+        NppStreamHandler h(stream);
+
+#if (NPP_VERSION >= 12205)
         size_t buf_size;
+        get_buf_size(sz, levels.cols, &buf_size, h);
 #else
         int buf_size;
-#endif
         get_buf_size(sz, levels.cols, &buf_size);
+#endif

         BufferPool pool(stream);
-        GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
-
-        NppStreamHandler h(stream);
+        CV_Assert(buf_size <= std::numeric_limits::max());
+        GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1);

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels.ptr(), levels.cols, buf.ptr(), h));
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels.ptr(), levels.cols, buf.ptr()) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -477,19 +526,25 @@ namespace
         sz.width  = src.cols;
         sz.height = src.rows;

-#if (CUDA_VERSION >= 12040)
+        NppStreamHandler h(stream);
+
+#if (NPP_VERSION >= 12205)
         size_t buf_size;
+        get_buf_size(sz, nLevels, &buf_size, h);
 #else
         int buf_size;
-#endif
         get_buf_size(sz, nLevels, &buf_size);
+#endif

         BufferPool pool(stream);
-        GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
-
-        NppStreamHandler h(stream);
+        CV_Assert(buf_size <= std::numeric_limits::max());
+        GpuMat buf = pool.getBuffer(1, static_cast(buf_size), CV_8UC1);

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func(src.ptr(), static_cast(src.step), sz, pHist, pLevels, nLevels, buf.ptr(), h));
+#else
         nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, pLevels, nLevels, buf.ptr()) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     };
 }

@@ -497,6 +552,24 @@ namespace
+class OldNppStreamHandlerForEvenLevels
+{
+public:
+    explicit OldNppStreamHandlerForEvenLevels(Stream& newStream)
+    {
+        oldStream = nppGetStream();
+        nppSafeSetStream(oldStream, StreamAccessor::getStream(newStream));
+    }
+
+    ~OldNppStreamHandlerForEvenLevels()
+    {
+        nppSafeSetStream(nppGetStream(), oldStream);
+    }
+
+private:
+    cudaStream_t oldStream;
+};
+
 void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream)
 {
     const int kind = _levels.kind();
@@ -509,6 +582,9 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
     else
         host_levels = _levels.getMat();

+    // Update to use NppStreamHandler when nppiEvenLevelsHost_32s_Ctx is included in nppist.lib and libnppist.so
+    OldNppStreamHandlerForEvenLevels h(stream);
+
     nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr(), nLevels, lowerLevel, upperLevel) );

     if (kind == _InputArray::CUDA_GPU_MAT)
@@ -537,10 +613,17 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int low
     typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream);
     static const hist_t hist_callers[] =
     {
+#if USE_NPP_STREAM_CTX
+        NppHistogramEvenC1::hist,
+        0,
+        NppHistogramEvenC1::hist,
+        NppHistogramEvenC1::hist
+#else
         NppHistogramEvenC1::hist,
         0,
         NppHistogramEvenC1::hist,
         NppHistogramEvenC1::hist
+#endif
     };

     GpuMat src = _src.getGpuMat();
@@ -561,10 +644,17 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lo
     typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
+#if USE_NPP_STREAM_CTX
+        NppHistogramEvenC4::hist,
+        0,
+        NppHistogramEvenC4::hist,
+        NppHistogramEvenC4::hist
+#else
         NppHistogramEvenC4::hist,
         0,
         NppHistogramEvenC4::hist,
         NppHistogramEvenC4::hist
+#endif
     };

     GpuMat src = _src.getGpuMat();
@@ -579,12 +669,21 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels,
     typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream);
     static const hist_t hist_callers[] =
     {
+#if USE_NPP_STREAM_CTX
+        NppHistogramRangeC1::hist,
+        0,
+        NppHistogramRangeC1::hist,
+        NppHistogramRangeC1::hist,
+        0,
+        NppHistogramRangeC1::hist
+#else
         NppHistogramRangeC1::hist,
         0,
         NppHistogramRangeC1::hist,
         NppHistogramRangeC1::hist,
         0,
         NppHistogramRangeC1::hist
+#endif
     };

     GpuMat src = _src.getGpuMat();
@@ -600,12 +699,21 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4]
     typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
+#if USE_NPP_STREAM_CTX
+        NppHistogramRangeC4::hist,
+        0,
+        NppHistogramRangeC4::hist,
+        NppHistogramRangeC4::hist,
+        0,
+        NppHistogramRangeC4::hist
+#else
         NppHistogramRangeC4::hist,
         0,
         NppHistogramRangeC4::hist,
         NppHistogramRangeC4::hist,
         0,
         NppHistogramRangeC4::hist
+#endif
     };

     GpuMat src = _src.getGpuMat();
diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
index a18a459c2b1..8690f54085d 100644
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -139,9 +139,15 @@ namespace
 {
     typedef typename NPPTypeTraits::npp_type npp_type;

+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst,
+        int dstStep, NppiRect dstRoi, const double coeffs[][3],
+        int interpolation, NppStreamContext ctx);
+#else
    typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst,
        int dstStep, NppiRect dstRoi, const double coeffs[][3],
        int interpolation);
+#endif
 };

 template ::func_t func> struct NppWarp
@@ -170,9 +176,15 @@ namespace
         cv::cuda::NppStreamHandler h(stream);

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func(src.ptr(), srcsz, static_cast(src.step), srcroi,
+            dst.ptr(), static_cast(dst.step), dstroi,
+            coeffs, npp_inter[interpolation], h));
+#else
         nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi,
             dst.ptr(), static_cast(dst.step), dstroi,
             coeffs, npp_inter[interpolation]) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -250,6 +262,24 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
     static const func_t funcs[2][6][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call}
+        },
+        {
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call}
+        }
+#else
         {
             {NppWarp::call, 0, NppWarp::call, NppWarp::call},
             {0, 0, 0, 0},
@@ -266,6 +296,7 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
             {NppWarp::call, 0, NppWarp::call, NppWarp::call},
             {NppWarp::call, 0, NppWarp::call, NppWarp::call}
         }
+#endif
     };

     dst.setTo(borderValue, stream);
@@ -389,6 +420,24 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M,
     static const func_t funcs[2][6][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call}
+        },
+        {
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {0, 0, 0, 0},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call},
+            {NppWarp::call, 0, NppWarp::call, NppWarp::call}
+        }
+#else
         {
             {NppWarp::call, 0, NppWarp::call, NppWarp::call},
             {0, 0, 0, 0},
@@ -405,6 +454,7 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M,
             {NppWarp::call, 0, NppWarp::call, NppWarp::call},
             {NppWarp::call, 0, NppWarp::call, NppWarp::call}
         }
+#endif
     };

     dst.setTo(borderValue, stream);
@@ -467,9 +517,15 @@ namespace
 {
     typedef typename NPPTypeTraits::npp_type npp_type;

+#if USE_NPP_STREAM_CTX
+    typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
+        npp_type* pDst, int nDstStep, NppiRect oDstROI,
+        double nAngle, double nShiftX, double nShiftY, int eInterpolation, NppStreamContext ctx);
+#else
     typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
         npp_type* pDst, int nDstStep, NppiRect oDstROI,
         double nAngle, double nShiftX, double nShiftY, int eInterpolation);
+#endif
 };

 template ::func_t func> struct NppRotate
@@ -495,8 +551,13 @@ namespace
         dstroi.height = dst.rows;
         dstroi.width  = dst.cols;

+#if USE_NPP_STREAM_CTX
+        nppSafeCall(func(src.ptr(), srcsz, static_cast(src.step), srcroi,
+            dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation], h));
+#else
         nppSafeCall( func(src.ptr(), srcsz, static_cast(src.step), srcroi,
             dst.ptr(), static_cast(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
+#endif

         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -509,12 +570,21 @@ void cv::cuda::rotate(InputArray _src, OutputArray _dst, Size dsize, double angl
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
     static const func_t funcs[6][4] =
     {
+#if USE_NPP_STREAM_CTX
+        {NppRotate::call, 0, NppRotate::call, NppRotate::call},
+        {0,0,0,0},
+        {NppRotate::call, 0, NppRotate::call, NppRotate::call},
+        {0,0,0,0},
+        {0,0,0,0},
+        {NppRotate::call, 0, NppRotate::call, NppRotate::call}
+#else
        {NppRotate::call, 0, NppRotate::call, NppRotate::call},
         {0,0,0,0},
         {NppRotate::call, 0, NppRotate::call, NppRotate::call},
         {0,0,0,0},
         {0,0,0,0},
         {NppRotate::call, 0, NppRotate::call, NppRotate::call}
+#endif
     };

     GpuMat src = _src.getGpuMat();

From a83259f323e969e47cf127c545ef66363c4de4ce Mon Sep 17 00:00:00 2001
From: Doekin
Date: Sat, 26 Oct 2024 12:59:14 +0800
Subject: [PATCH 2/2] wechat_qrcode: fix ineffective decoding retrial after
 inverting image

---
 modules/wechat_qrcode/src/zxing/qrcode/qrcode_reader.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/wechat_qrcode/src/zxing/qrcode/qrcode_reader.cpp b/modules/wechat_qrcode/src/zxing/qrcode/qrcode_reader.cpp
index b6c4c4f71c0..c5f624e89c0 100644
--- a/modules/wechat_qrcode/src/zxing/qrcode/qrcode_reader.cpp
+++ b/modules/wechat_qrcode/src/zxing/qrcode/qrcode_reader.cpp
@@ -49,7 +49,8 @@ vector<Ref<Result>> QRCodeReader::decode(Ref<BinaryBitmap> image, DecodeHints hi
         Ref<BitMatrix> invertedMatrix = image->getInvertedMatrix(err_handler);
         if (err_handler.ErrCode() || invertedMatrix == NULL) return result_list;
         vector<Ref<Result>> tmp_rst = decodeMore(image, invertedMatrix, hints, err_handler);
-        if (err_handler.ErrCode() || tmp_rst.empty()) return tmp_rst;
+        if (err_handler.ErrCode() || tmp_rst.empty()) return result_list;
+        return tmp_rst;
     }
     return rst;
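
Editor's note on the pattern applied by the first patch: every NPP call is switched to its `_Ctx` entry point when `USE_NPP_STREAM_CTX` is set, so the CUDA stream travels with each call inside an `NppStreamContext` instead of through the process-global `nppSetStream()` state. This is the same mechanism the old `video_reader.cpp` code built by hand with `nppGetStreamContext`. Below is a minimal, self-contained sketch of that dispatch, using `nppiTranspose_8u_C1R` as the example primitive; the wrapper function `transposeOnStream` is hypothetical (the patch instead routes the context through OpenCV's `NppStreamHandler`), and the sketch assumes `USE_NPP_STREAM_CTX` is defined by the build when the installed NPP ships the `_Ctx` API.

```cpp
#include <cuda_runtime.h>
#include <npp.h>

// Hypothetical wrapper, for illustration only: dispatch between the legacy
// global-stream NPP API and the NppStreamContext API, as the patch does
// behind USE_NPP_STREAM_CTX.
static NppStatus transposeOnStream(const Npp8u* src, int srcStep,
                                   Npp8u* dst, int dstStep,
                                   NppiSize roi, cudaStream_t stream)
{
#if USE_NPP_STREAM_CTX
    NppStreamContext ctx{};
    NppStatus status = nppGetStreamContext(&ctx); // fills device attributes
    if (status != NPP_SUCCESS)
        return status;
    ctx.hStream = stream;                         // run on the caller's stream
    return nppiTranspose_8u_C1R_Ctx(src, srcStep, dst, dstStep, roi, ctx);
#else
    nppSetStream(stream);                         // global state, not thread-safe
    return nppiTranspose_8u_C1R(src, srcStep, dst, dstStep, roi);
#endif
}
```

Passing the context explicitly is what lets the patch drop workarounds such as the previously commented-out `NppStreamHandler` in `reductions.cpp` (see opencv/opencv#11063): with the `_Ctx` calls, concurrent users of different streams no longer race on shared library state.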